From 91211da86379251978da300f463552c1687feac4 Mon Sep 17 00:00:00 2001
From: Antonin Blot <antonin.blot@crick.ac.uk>
Date: Sat, 27 May 2023 15:35:26 +0100
Subject: [PATCH 01/73] [camp] huge refactor of yaml creation

---
 flexiznam/camp/sync_data.py | 798 +++++++++---------------------------
 1 file changed, 190 insertions(+), 608 deletions(-)

diff --git a/flexiznam/camp/sync_data.py b/flexiznam/camp/sync_data.py
index 99a33a3..0cd5faf 100644
--- a/flexiznam/camp/sync_data.py
+++ b/flexiznam/camp/sync_data.py
@@ -4,6 +4,7 @@
 from pathlib import Path, PurePosixPath
 import re
 import copy
+import warnings
 import yaml
 from yaml.parser import ParserError
 
@@ -14,134 +15,116 @@
 from flexiznam.utils import clean_recursively
 
 
-def create_yaml(
+def create_yaml_dict(
     root_folder,
-    outfile=None,
-    project="NOT SPECIFIED",
-    mouse="NOT SPECIFIED",
-    overwrite=False,
+    project,
+    genealogy,
+    format_yaml=True,
 ):
-    """Automatically create a yaml file skeleton
+    """Create a yaml dict from a folder
 
-    Goes recursively in root folder and create a set of nested structure
+    Recursively parse a folder and create a yaml dict with the structure of the folder.
 
     Args:
-        root_folder (str or Path): base folder, usually a session but can be a sample
-        outfile (str or Path): target to write the yaml. Do not write file if `None`
-        project (str): name of the project
-        mouse (str): name of the mouse
-        overwrite (bool): overwrite outfile if it exists. Default False.
+        root_folder (str): Path to the folder to parse
+        project (str): Name of the project, used as root of the path in the output
+        genealogy (list): List of strings with the genealogy of root_folder. If
+            root_folder is a recording for instance, genealogy should be (mouse,
+            session).
+        format_yaml (bool, optional): Format the output to be yaml compatible if True,
+            otherwise keep dataset as Dataset object and path as pathlib.Path. Defaults
+            to True.
 
     Returns:
-        yaml_dict (dict): created structure
+        dict: Dictionary with the structure of the folder and automatically detected
+            datasets
     """
-    root_folder = pathlib.Path(root_folder)
-    assert root_folder.is_dir()
-    assert isinstance(project, str)
-    assert isinstance(mouse, str)
-    yaml_dict = dict(project=project, mouse=mouse)
-    yaml_dict["session"] = None
-    # check if we were given a session folder
-    if re.match(r"S\d*", root_folder.stem):
-        yaml_dict["session"] = root_folder.stem
-
-    _find_yaml_struct(root_folder, yaml_dict)
-
-    if outfile is not None:
-        if outfile.is_file() and not overwrite:
-            raise IOError(
-                "File %s already exists. Use `overwrite` to replace." % outfile
-            )
-        with open(outfile, "w") as writer:
-            yaml.dump(yaml_dict, writer)
-
-    return yaml_dict
-
-
-def _find_yaml_struct(path, current_dict):
-    """Parse one level of yaml structure for autogenerating yaml
+    if isinstance(genealogy, str):
+        genealogy = [genealogy]
+    data = _create_yaml_dict(
+        level_folder=root_folder,
+        project=project,
+        genealogy=genealogy,
+        format_yaml=format_yaml,
+        parent_dict=dict(),
+    )
+    out = dict(root_folder=root_folder, root_genealogy=genealogy, children=data)
+    return out
 
-    Args:
-        path: path to the dir to parse
-        current_dict: current level
 
-    Returns:
-        current_dict (do changes in place)
-    """
-    path = Path(path)
-    for el in os.listdir(path):
-        if not (path / el).is_dir():
-            continue
-        # match known recording format
-        m = re.fullmatch(r"R\d\d\d\d\d\d_?(.*)?", el)
-        if m:
-            el_type = "recordings"
-            protocol = m[1] if m[1] is not None else "PROTOCOL NOT SPECIFIED"
-        else:
-            el_type = "samples"
-        subdict = current_dict.get(el_type, {})
-        subdict[el] = dict()
-        if el_type == "recordings":
-            subdict[el]["protocol"] = protocol
-        current_dict[el_type] = subdict
-        _find_yaml_struct(path / el, current_dict[el_type][el])
-    return current_dict
+def _create_yaml_dict(
+    level_folder,
+    project,
+    genealogy,
+    format_yaml,
+    parent_dict,
+):
+    """Private function to create a yaml dict from a folder
 
+    Add a private function to hide the arguments that are used only for recursion
+    (parent_dict)
 
-def parse_yaml(path_to_yaml, raw_data_folder=None, verbose=True):
-    """Read an acquisition yaml and create corresponding datasets
+    See `create_yaml_dict` for documentation
 
     Args:
-        path_to_yaml (str or dict): path to the file to parse or dict of yaml contect
-        raw_data_folder (str): root folder. Typically project folder or folder 
-            containing the mice subfolders
-        verbose (bool): print info while looking for datasets
-
-    Returns:
-        dict: A yaml dictionary with dataset classes
-
+        level_folder (Path): folder to parse
+        project (str): name of the project
+        genealogy (tuple): genealogy of the current folder
+        format_yaml (bool): format results to be yaml compatible or keep Dataset
+            and pathlib.Path objects
+        parent_dict (dict): dict of the parent folder. Used for recursion
     """
-    session_data = _clean_yaml(path_to_yaml)
 
-    if raw_data_folder is None:
-        raw_data_folder = Path(PARAMETERS["data_root"]["raw"])
-        raw_data_folder /= session_data["project"]
-
-    if session_data["path"] is not None:
-        home_folder = Path(raw_data_folder) / session_data["path"]
-    elif session_data["session"] is not None:
-        home_folder = (
-            Path(raw_data_folder) / session_data["mouse"] / session_data["session"]
+    level_folder = Path(level_folder)
+    assert level_folder.is_dir(), "root_folder must be a directory"
+    level_dict = dict()
+    genealogy = list(genealogy)
+
+    level_name = level_folder.stem
+    m = re.fullmatch(r"R\d\d\d\d\d\d_?(.*)?", level_name)
+    if m:
+        level_dict["datatype"] = "recording"
+        level_dict["protocol"] = (
+            m[1] if m[1] is not None else "XXERRORXX PROTOCOL NOT SPECIFIED"
         )
-    else:
-        home_folder = Path(raw_data_folder) / session_data["mouse"]
-        # first load datasets in the session level
-    if not home_folder.is_dir():
-        raise FileNotFoundError("Session directory %s does not exist" % home_folder)
-    session_data["path"] = home_folder
-    session_data["datasets"] = create_dataset(
-        dataset_infos=session_data["datasets"],
-        verbose=verbose,
-        parent=session_data,
-        raw_data_folder=raw_data_folder,
-        error_handling="report",
-    )
+        level_dict["recording_type"] = "XXERRORXX error RECORDING TYPE NOT SPECIFIED"
 
-    for rec_name, recording in session_data["recordings"].items():
-        recording["path"] = str(PurePosixPath(home_folder / rec_name))
-        recording["datasets"] = create_dataset(
-            dataset_infos=recording["datasets"],
-            parent=recording,
-            raw_data_folder=raw_data_folder,
-            verbose=verbose,
-            error_handling="report",
-        )
-
-    session_data["samples"] = _create_sample_datasets(session_data, raw_data_folder)
-
-    # remove the full path that are not needed
-    clean_recursively(session_data)
-    return session_data
+    elif re.fullmatch(r"S\d*", level_name):
+        level_dict["datatype"] = "session"
+    else:
+        level_dict["datatype"] = "sample"
+    level_dict["genealogy"] = genealogy + [level_name]
+    level_dict["path"] = Path(project, *level_dict["genealogy"])
+    if format_yaml:
+        level_dict["path"] = str(PurePosixPath(level_dict["path"]))
+    children = dict()
+    datasets = Dataset.from_folder(level_folder)
+    if datasets:
+        for ds_name, ds in datasets.items():
+            ds.genealogy = genealogy + list(ds.genealogy)
+            if format_yaml:
+                # find path root
+                proot = str(level_folder)[: -len(level_dict["path"])]
+                ds.path = ds.path.relative_to(proot)
+                children[ds_name] = ds.format(mode="yaml")
+                children[ds_name]["path"] = str(
+                    PurePosixPath(children[ds_name]["path"])
+                )
+            else:
+                children[ds_name] = ds
+
+    for child in level_folder.glob("*"):
+        if child.is_dir():
+            _create_yaml_dict(
+                child,
+                project=project,
+                genealogy=genealogy + [level_name],
+                format_yaml=format_yaml,
+                parent_dict=children,
+            )
+    level_dict["children"] = children
+    parent_dict[level_folder.stem] = level_dict
+    return parent_dict
 
 
 def upload_yaml(
@@ -170,518 +153,117 @@ def upload_yaml(
         list of names of entities created/updated
 
     """
-    output = []
-    # if there are errors, I cannot safely parse the yaml
-    errors = find_xxerrorxx(yml_file=source_yaml)
-    if errors:
-        raise SyncYmlError("The yaml file still contains error. Fix it")
-    session_data = parse_yaml(source_yaml, raw_data_folder, verbose)
-    # parsing can created errors, check again
-    errors = find_xxerrorxx(yml_file=source_yaml)
-    if errors:
-        raise SyncYmlError("Invalid yaml. Use `parse_yaml` and fix errors manually.")
+    with open(source_yaml, "r") as f:
+        yaml_data = yaml.safe_load(f)
+
+    # first find the origin
 
-    # first find the mouse
     if flexilims_session is None:
-        flexilims_session = flz.get_flexilims_session(
-            project_id=session_data["project"]
-        )
-    mouse = flz.get_entity(
-        datatype="mouse",
-        name=session_data["mouse"],
+        flexilims_session = flz.get_flexilims_session(project_id=yaml_data["project"])
+
+    origin_name = "_".join(yaml_data["root_genealogy"])
+    with warnings.catch_warnings():
+        warnings.simplefilter("ignore")
+        origin = flz.get_entity(name=origin_name, flexilims_session=flexilims_session)
+    assert origin is not None, f"`{origin_name}` not found on flexilims"
+    if verbose:
+        print(f"Found origin `{origin_name}` with id `{origin.id}`")
+    # then upload the data recursively
+    _upload_yaml_dict(
+        yaml_data["children"],
+        origin=origin,
+        raw_data_folder=raw_data_folder,
+        log_func=log_func,
         flexilims_session=flexilims_session,
-        format_reply=False,
+        conflicts=conflicts,
+        verbose=verbose,
     )
-    if mouse is None:
-        raise SyncYmlError("Mouse not on flexilims. You must add it manually first")
-
-    # deal with the session
-    if session_data["session"] is not None:
-        m = re.match(r"S(\d{4})(\d\d)(\d\d)", session_data["session"])
-        if m:
-            date = "-".join(m.groups())
-        else:
-            log_func("Cannot parse date for session %s." % session_data["session"])
-            date = "N/A"
-
-    session_data = _trim_paths(session_data, raw_data_folder)
-
-    attributes = session_data.get("attributes", None)
-    if attributes is None:
-        attributes = {}
-    for field in ("path", "notes"):
-        value = session_data.get(field, None)
-        if value is not None:
-            attributes[field] = value
-
-    # if session is not specified, then entries will be added directly as
-    # children of the mouse
-    if session_data["session"] is not None:
-        session = flz.add_experimental_session(
-            parent_name=mouse["name"],
-            session_name=session_data["session"],
-            flexilims_session=flexilims_session,
-            date=date,
-            attributes=attributes,
-            conflicts=conflicts,
-        )
-        root_id = session["id"]
-        output.append(session["name"])
-    else:
-        root_id = mouse["id"]
 
-    # session datasets
-    for ds_name, ds in session_data.get("datasets", {}).items():
-        ds.genealogy = [mouse["name"], session_data["session"], ds_name]
-        ds.project = session_data["project"]
-        ds.origin_id = root_id
-        ds.flexilims_session = flexilims_session
-        ds.update_flexilims(mode="safe")
-        output.append(ds.full_name)
 
-    # now deal with recordings
-    for short_rec_name, rec_data in session_data.get("recordings", {}).items():
-        rec_name = session["name"] + "_" + short_rec_name
-        attributes = rec_data.get("attributes", None)
-        if attributes is None:
-            attributes = {}
-        for field in ["notes", "path", "timestamp"]:
-            value = rec_data.get(field, "")
-            attributes[field] = value if value is not None else ""
-        attributes["genealogy"] = session["attributes"]["genealogy"] + [short_rec_name]
-        rec_type = rec_data.get("recording_type", "unspecified")
-        if not rec_type:
-            rec_type = "unspecified"
-        rec_rep = flz.add_recording(
-            session_id=root_id,
-            recording_type=rec_type,
-            protocol=rec_data.get("protocol", ""),
-            attributes=attributes,
-            recording_name=rec_name,
-            other_relations=None,
-            flexilims_session=flexilims_session,
-            conflicts=conflicts,
-        )
-        output.append(rec_rep["name"])
-
-        # now deal with recordings' datasets
-        for ds_name, ds in rec_data.get("datasets", {}).items():
-            ds.genealogy = [
-                mouse["name"],
-                session_data["session"],
-                short_rec_name,
-                ds_name,
-            ]
-            ds.project = session_data["project"]
-            ds.origin_id = rec_rep["id"]
-            ds.flexilims_session = flexilims_session
-            ds.update_flexilims(mode="safe")
-            output.append(ds.full_name)
-
-    # now deal with samples
-    def add_samples(samples, parent, output=None):
-        # we'll need a utility function to deal with recursion
-        for short_sample_name, sample_data in samples.items():
-
-            # we always use `skip` to add samples
-            sample_rep = flz.add_sample(
-                parent["id"],
-                attributes=attributes,
-                sample_name=short_sample_name,
-                conflicts="skip",
-                flexilims_session=flexilims_session,
-            )
-            if output is not None:
-                output.append(sample_rep["name"])
-            # deal with datasets attached to this sample
-            for ds_name, ds in sample_data.get("datasets", {}).items():
-                ds.genealogy = sample_rep["attributes"]["genealogy"] + [ds_name]
-                ds.project = session_data["project"]
-                ds.origin_id = sample_rep["id"]
-                ds.flexilims_session = flexilims_session
-                ds.update_flexilims(mode="safe")
-                if output is not None:
-                    output.append(ds.full_name)
-            # now add child samples
-            add_samples(sample_data["samples"], sample_rep, output)
-
-    # samples are attached to mice, not sessions
-    add_samples(session_data["samples"], mouse, output=output)
-    return output
-
-
-def write_session_data_as_yaml(session_data, target_file=None, overwrite=False):
-    """Write a session_data dictionary into a yaml
-
-    Args:
-        session_data (dict): dictionary with Dataset instances, as returned by parse_yaml
-        target_file (str): path to the output file (if None, does not write to disk)
-        overwrite (bool): replace target file if it already exists (default False)
-
-    Returns:
-        dict: the pure yaml dictionary
-
-    """
-    out_dict = copy.deepcopy(session_data)
-    clean_recursively(out_dict, keys=["name"], format_dataset=True)
-    if target_file is not None:
-        target_file = Path(target_file)
-        if target_file.exists() and not overwrite:
-            raise IOError("Target file %s already exists" % target_file)
-        with open(target_file, "w") as writer:
-            yaml.dump(out_dict, writer)
-        # temp check:
-        with open(target_file, "r") as reader:
-            writen = yaml.safe_load(reader)
-    return out_dict
-
-
-def create_dataset(
-    dataset_infos, parent, raw_data_folder, verbose=True, error_handling="crash"
+def _upload_yaml_dict(
+    yaml_dict, origin, raw_data_folder, log_func, flexilims_session, conflicts, verbose
 ):
-    """Create dictionary of datasets
-
-    Args:
-        dataset_infos: extra information for reading dataset outside of raw_data_folder
-          or adding optional arguments
-        parent (dict): yaml dictionary of the parent level
-        raw_data_folder (str): folder where to look for data
-        verbose (bool): (True) Print info about dataset found
-        error_handling (str) `crash` or `report`. When something goes wrong, raise an
-            error if `crash` otherwise replace the dataset instance by the error
-            message in the output dictionary
-
-    Returns:
-        dict: dictionary of dataset instances
-
-    """
-
-    # autoload datasets
-    datasets = Dataset.from_folder(parent["path"], verbose=verbose)
-    error_handling = error_handling.lower()
-    if error_handling not in ("crash", "report"):
-        raise IOError("error_handling must be `crash` or `report`")
-
-    # check dataset_infos for extra datasets
-    for ds_name, ds_data in dataset_infos.items():
-        ds_path = Path(raw_data_folder) / ds_data["path"]
-        # first deal with dataset that are not in parent path
-        ds_class = Dataset.SUBCLASSES.get(ds_data["dataset_type"], Dataset)
-        if ds_path.is_dir() and (ds_path != parent["path"]):
-            ds = ds_class.from_folder(ds_path, verbose=verbose)
-        elif ds_path.is_file() and (ds_path.parent != parent["path"]):
-            ds = ds_class.from_folder(ds_path.parent, verbose=verbose)
-        elif not ds_path.exists():
-            err_msg = "Dataset not found. Path %s does not exist" % ds_path
-            if error_handling == "crash":
-                raise FileNotFoundError(err_msg)
-            datasets[ds_name] = "XXERRORXX!! " + err_msg
-            continue
-        else:
-            # if it is in the parent['path'] folder, I already loaded it.
-            ds = {k: v for k, v in datasets.items() if isinstance(v, ds_class)}
-        if not ds:
-            err_msg = 'Dataset "%s" not found in %s' % (ds_name, ds_path)
-            if error_handling == "crash":
-                raise SyncYmlError(err_msg)
-            datasets[ds_name] = "XXERRORXX!! " + err_msg
-
-        # match by name
-        if ds_name in ds:
-            ds = ds[ds_name]
-        else:  # now we're in trouble.
-            err_msg = 'Could not find dataset "%s". Found "%s" instead' % (
-                ds_name,
-                ", ".join(ds.keys()),
+    for entity, entity_data in yaml_dict.items():
+        children = entity_data.pop("children", {})
+        datatype = entity_data.pop("datatype")
+        if datatype == "session":
+            if verbose:
+                print(f"Adding session `{entity}`")
+            new_entity = flz.add_experimental_session(
+                date=entity[1:],
+                flexilims_session=flexilims_session,
+                parent_id=origin["id"],
+                attributes=entity_data,
+                session_name=entity,
+                conflicts=conflicts,
             )
-            if error_handling == "crash":
-                raise SyncYmlError(err_msg)
-            datasets[ds_name] = "XXERRORXX!! " + err_msg
-            continue
-        if ds_data["attributes"] is not None:
-            ds.extra_attributes.update(ds_data["attributes"])
-        if ds_data["notes"] is not None:
-            ds.extra_attributes["notes"] = ds_data["notes"]
-        datasets[ds_name] = ds
-    return datasets
-
-
-def _trim_paths(session_data, raw_data_folder):
-    """Parses paths to make them relative to `raw_data_folder`
-
-    Args:
-        session_data (dict): dictionary containing children of the session
-        raw_data_folder (str): part of the path to be omitted from on flexilims
-
-    Returns:
-        dict: `session_data` after trimming the paths
-
-    """
-
-    def trim_sample_paths(samples):
-        # utility function to recurse into samples
-        for sample_name, sample_data in samples.items():
-            samples[sample_name]["path"] = str(
-                PurePosixPath(
-                    Path(samples[sample_name]["path"]).relative_to(raw_data_folder)
+        elif datatype == "recording":
+            rec_type = entity_data.pop("recording_type", "Not specified")
+            prot = entity_data.pop("protocol", "Not specified")
+            if verbose:
+                print(
+                    f"Adding recording `{entity}`, type `{rec_type}`, protocol `{prot}`"
                 )
+            new_entity = flz.add_recording(
+                session_id=origin["id"],
+                recording_type=rec_type,
+                protocol=prot,
+                attributes=entity_data,
+                recording_name=entity,
+                conflicts=conflicts,
+                flexilims_session=flexilims_session,
             )
-            for ds_name, ds in sample_data.get("datasets", {}).items():
-                ds.path = PurePosixPath(ds.path.relative_to(raw_data_folder))
-            trim_sample_paths(sample_data["samples"])
-
-    if raw_data_folder is None:
-        raw_data_folder = Path(PARAMETERS["data_root"]["raw"])
-    if "path" in session_data.keys():
-        session_data["path"] = str(
-            PurePosixPath(Path(session_data["path"]).relative_to(raw_data_folder))
-        )
-    for ds_name, ds in session_data.get("datasets", {}).items():
-        ds.path = ds.path.relative_to(raw_data_folder)
-    for rec_name, rec_data in session_data["recordings"].items():
-        session_data["recordings"][rec_name]["path"] = str(
-            PurePosixPath(
-                Path(session_data["recordings"][rec_name]["path"]).relative_to(
-                    raw_data_folder
-                )
+        elif datatype == "sample":
+            if verbose:
+                print(f"Adding sample `{entity}`")
+            new_entity = flz.add_sample(
+                parent_id=origin["id"],
+                attributes=entity_data,
+                sample_name=entity,
+                conflicts=conflicts,
+                flexilims_session=flexilims_session,
+            )
+        elif datatype == "dataset":
+            created = entity_data.pop("created")
+            dataset_type = entity_data.pop("dataset_type")
+            path = entity_data.pop("path")
+            genealogy = entity_data.pop("genealogy")
+            if verbose:
+                print(f"Adding dataset `{entity}`, type `{dataset_type}`")
+            new_entity = flz.add_dataset(
+                parent_id=origin["id"],
+                dataset_type=dataset_type,
+                created=created,
+                path=path,
+                genealogy=genealogy,
+                is_raw="yes",
+                project_id=None,
+                flexilims_session=None,
+                dataset_name=None,
+                attributes=None,
+                strict_validation=False,
+                conflicts="append",
             )
-        )
-        for ds_name, ds in rec_data.get("datasets", {}).items():
-            ds.path = PurePosixPath(ds.path.relative_to(raw_data_folder))
-    trim_sample_paths(session_data["samples"])
-    return session_data
-
-
-def _create_sample_datasets(parent, raw_data_folder):
-    """Recursively index samples creating a nested dictionary and generate
-    corresponding datasets
-
-    Args:
-        parent (dict): Dictionary corresponding to the parent entity
-
-    Return:
-        dict: dictionary of child samples
 
-    """
-    if "samples" not in parent:
-        return dict()
-    for sample_name, sample in parent["samples"].items():
-        sample["path"] = parent["path"] / sample_name
-        sample["datasets"] = create_dataset(
-            dataset_infos=sample["datasets"],
-            parent=sample,
+        _upload_yaml_dict(
+            yaml_dict=children,
+            origin=new_entity,
             raw_data_folder=raw_data_folder,
-            error_handling="report",
-        )
-
-        # recurse into child samples
-        sample["samples"] = _create_sample_datasets(sample, raw_data_folder)
-    # we update in place but we also return the dictionary of samples to make
-    # for more readable code
-    return parent["samples"]
-
-
-def _clean_yaml(path_to_yaml):
-    """Read a yaml file and check that it is correctly formatted
-
-    This does not do any processing, just make sure that I can read the whole yaml and
-    generate dictionary will all expected fields
-
-    Args:
-        path_to_yaml (str): path to the YAML file, or dict of the yaml content
-
-    Returns:
-        dict: nested dictionary containing entries in the YAML file
-
-    """
-
-    if isinstance(path_to_yaml, dict):
-        yml_data = path_to_yaml
-    else:
-        with open(path_to_yaml, "r") as yml_file:
-            try:
-                yml_data = yaml.safe_load(yml_file)
-            except ParserError as e:
-                raise IOError("Invalid yaml. Parser returned an error: %s" % e)
-
-    session, nested_levels = _read_level(yml_data)
-
-    session["datasets"] = {}
-    for dataset_name, dataset_dict in nested_levels["datasets"].items():
-        session["datasets"][dataset_name] = _read_dataset(
-            name=dataset_name, data=dataset_dict
-        )
-
-    session["recordings"] = {}
-    for rec_name, rec_dict in nested_levels["recordings"].items():
-        session["recordings"][rec_name] = _read_recording(name=rec_name, data=rec_dict)
-
-    session["samples"] = {}
-    for sample_name, sample_dict in nested_levels["samples"].items():
-        session["samples"][sample_name] = _read_sample(
-            name=sample_name, data=sample_dict
-        )
-
-    return session
-
-
-def _read_sample(name, data):
-    """Read YAML information corresponding to a sample
-
-    Args:
-        name (str): the name of the sample
-        data (dict): data for this sample only
-
-    Returns:
-        dict: the sample read from the yaml
-
-    """
-    if data is None:
-        data = {}
-    sample, nested_levels = _read_level(
-        data,
-        mandatory_args=(),
-        optional_args=("notes", "attributes", "path"),
-        nested_levels=("datasets", "samples"),
-    )
-    sample["name"] = name
-
-    sample["datasets"] = dict()
-    for ds_name, ds_data in nested_levels["datasets"].items():
-        sample["datasets"][ds_name] = _read_dataset(name=ds_name, data=ds_data)
-    sample["samples"] = dict()
-    for sample_name, sample_data in nested_levels["samples"].items():
-        sample["samples"][sample_name] = _read_sample(
-            name=sample_name, data=sample_data
+            log_func=log_func,
+            flexilims_session=flexilims_session,
+            conflicts=conflicts,
+            verbose=verbose,
         )
-    return sample
-
-
-def _read_recording(name, data):
-    """Read YAML information corresponding to a recording
-
-    Args:
-        name (str): the name of the recording
-        data (dict): data for this dataset only
-
-    Returns:
-        dict: the recording read from the yaml
-
-    """
-    recording, datasets = _read_level(
-        data,
-        mandatory_args=("protocol",),
-        optional_args=("notes", "attributes", "path", "recording_type", "timestamp"),
-        nested_levels=("datasets",),
-    )
-    recording["name"] = name
-
-    # if timestamps is None, the name must start with RHHMMSS
-    if recording["timestamp"] is None:
-        m = re.match(r"R(\d\d\d\d\d\d)", recording["name"])
-        if not m:
-            raise SyncYmlError(
-                "Timestamp must be provided if recording name is not "
-                "properly formatted"
-            )
-        recording["timestamp"] = m.groups()[0]
-    recording["datasets"] = dict()
-    for ds_name, ds_data in datasets["datasets"].items():
-        recording["datasets"][ds_name] = _read_dataset(name=ds_name, data=ds_data)
-
-    return recording
-
-
-def _read_dataset(name, data):
-    """Read YAML information corresponding to a dataset
-
-    Args:
-        name (str): the name of the dataset, will be composed with parent names to
-        generate an identifier
-        data (dict): data for this dataset only
 
-    Returns:
-        dict: a formatted dictionary including,  'dataset_type', 'path', 'notes',
-        'attributes' and 'name'
 
-    """
-    level, _ = _read_level(
-        data,
-        mandatory_args=("dataset_type", "path"),
-        optional_args=(
-            "notes",
-            "attributes",
-            "created",
-            "is_raw",
-            "origin_id",
-            "genealogy",
-        ),
-        nested_levels=(),
+if __name__ == "__main__":
+    data = create_yaml_dict(
+        "/Volumes/lab-znamenskiyp/data/instruments/raw_data/projects/blota_onix_pilote/BRAC7448.2d/S20230412",
+        project="blota_onix_pilote",
+        genealogy="BRAC7448.2d",
     )
-    level["name"] = name
-    return level
-
-
-def _read_level(
-    yml_level,
-    mandatory_args=("project", "mouse", "session"),
-    optional_args=("path", "notes", "attributes", "genealogy"),
-    nested_levels=("recordings", "datasets", "samples"),
-):
-    """Read one layer of the yml file (i.e. a dictionary)
-
-    Args:
-        yml_level (dict): a dictionary containing the yml level to analyse (and all sublevels)
-        mandatory_args: arguments that must be in this level
-        optional_args: arguments that are expected but not mandatory, will be `None` if
-            absent
-        nested_levels: name of any nested level that should not be parsed
-
-    Returns:
-        (tuple): a tuple containing two dictionaries:
-            level (dict): dictionary of top level attributes
-            nested_levels (dict): dictionary of nested dictionaries
-    """
-    # make a copy to not change original version
-    yml_level = yml_level.copy()
-    is_absent = [m not in yml_level for m in mandatory_args]
-    if any(is_absent):
-        absents = ", ".join(["%s" % a for a, m in zip(mandatory_args, is_absent) if m])
-        raise SyncYmlError("%s must be provided in the YAML file." % absents)
-    level = {m: yml_level.pop(m) for m in mandatory_args}
-
-    for opt in optional_args:
-        level[opt] = yml_level.pop(opt, None)
-
-    nested_levels = {n: yml_level.pop(n, {}) for n in nested_levels}
-
-    # the rest is unexpected
-    if len(yml_level):
-        raise SyncYmlError(
-            "Got unexpected attribute(s): %s" % (", ".join(yml_level.keys()))
-        )
-    return level, nested_levels
-
-
-def find_xxerrorxx(yml_file=None, yml_data=None, pattern="XXERRORXX", _output=None):
-    """Utility to find where things went wrong
-
-    Look through a `yml_file` or the corresponding `yml_Data` dictionary recursively.
-    Returns a dictionary with all entries containing the error `pattern`
-
-    _output is used for recursive calling.
-    """
-    if yml_file is not None:
-        if yml_data is not None:
-            raise IOError("Set either yml_file OR yml_data")
-        with open(yml_file, "r") as reader:
-            yml_data = yaml.safe_load(reader)
-
-    if _output is None:
-        _output = dict()
-    for k, v in yml_data.items():
-        if isinstance(v, dict):
-            _output = find_xxerrorxx(yml_data=v, pattern=pattern, _output=_output)
-        elif isinstance(v, str) and (pattern in v):
-            _output[k] = v
-    return _output
+    with open("test.yml", "w") as writer:
+        yaml.safe_dump(data, writer)
+    print("done")
+    flm_sess = flz.get_flexilims_session(project_id="blota_onix_pilote")
+    upload_yaml("test.yml", conflicts="overwrite", flexilims_session=flm_sess)

From 6d9e94349d11c93aab4b565beac62e81a3cde8f9 Mon Sep 17 00:00:00 2001
From: Antonin Blot <antonin.blot@crick.ac.uk>
Date: Sun, 28 May 2023 09:14:04 +0100
Subject: [PATCH 02/73] Add GUI module with tkinter

---
 CHANGELOG.md              |   1 +
 flexiznam/gui/azure.tcl   |  87 ++++++++++++
 flexiznam/gui/flexigui.py | 277 ++++++++++++++++++++++++++++++++++++++
 3 files changed, 365 insertions(+)
 create mode 100644 flexiznam/gui/azure.tcl
 create mode 100644 flexiznam/gui/flexigui.py

diff --git a/CHANGELOG.md b/CHANGELOG.md
index 74a5ce8..3c7bd61 100644
--- a/CHANGELOG.md
+++ b/CHANGELOG.md
@@ -8,6 +8,7 @@
 - `flz.get_datasets` can return `Dataset` objects instead of path strings if 
   `return_paths=False`
 - New `OnixData` class to handle Onix data
+- Add a GUI module.
 
 ### Minor
 - `CameraDataset` metadata can also be `.yml`, not only `.txt`.
diff --git a/flexiznam/gui/azure.tcl b/flexiznam/gui/azure.tcl
new file mode 100644
index 0000000..3e75502
--- /dev/null
+++ b/flexiznam/gui/azure.tcl
@@ -0,0 +1,87 @@
+# Copyright © 2021 rdbende <rdbende@gmail.com>
+
+source [file join [file dirname [info script]] theme light.tcl]
+source [file join [file dirname [info script]] theme dark.tcl]
+
+option add *tearOff 0
+
+proc set_theme {mode} {
+	if {$mode == "dark"} {
+		ttk::style theme use "azure-dark"
+
+		array set colors {
+            -fg             "#ffffff"
+            -bg             "#333333"
+            -disabledfg     "#ffffff"
+            -disabledbg     "#737373"
+            -selectfg       "#ffffff"
+            -selectbg       "#007fff"
+        }
+        
+        ttk::style configure . \
+            -background $colors(-bg) \
+            -foreground $colors(-fg) \
+            -troughcolor $colors(-bg) \
+            -focuscolor $colors(-selectbg) \
+            -selectbackground $colors(-selectbg) \
+            -selectforeground $colors(-selectfg) \
+            -insertcolor $colors(-fg) \
+            -insertwidth 1 \
+            -fieldbackground $colors(-selectbg) \
+            -font {"Segoe Ui" 10} \
+            -borderwidth 1 \
+            -relief flat
+
+        tk_setPalette background [ttk::style lookup . -background] \
+            foreground [ttk::style lookup . -foreground] \
+            highlightColor [ttk::style lookup . -focuscolor] \
+            selectBackground [ttk::style lookup . -selectbackground] \
+            selectForeground [ttk::style lookup . -selectforeground] \
+            activeBackground [ttk::style lookup . -selectbackground] \
+            activeForeground [ttk::style lookup . -selectforeground]
+
+        ttk::style map . -foreground [list disabled $colors(-disabledfg)]
+
+        option add *font [ttk::style lookup . -font]
+        option add *Menu.selectcolor $colors(-fg)
+    
+	} elseif {$mode == "light"} {
+		ttk::style theme use "azure-light"
+
+        array set colors {
+            -fg             "#000000"
+            -bg             "#ffffff"
+            -disabledfg     "#737373"
+            -disabledbg     "#ffffff"
+            -selectfg       "#ffffff"
+            -selectbg       "#007fff"
+        }
+
+		ttk::style configure . \
+            -background $colors(-bg) \
+            -foreground $colors(-fg) \
+            -troughcolor $colors(-bg) \
+            -focuscolor $colors(-selectbg) \
+            -selectbackground $colors(-selectbg) \
+            -selectforeground $colors(-selectfg) \
+            -insertcolor $colors(-fg) \
+            -insertwidth 1 \
+            -fieldbackground $colors(-selectbg) \
+            -font {"Segoe Ui" 10} \
+            -borderwidth 1 \
+            -relief flat
+
+        tk_setPalette background [ttk::style lookup . -background] \
+            foreground [ttk::style lookup . -foreground] \
+            highlightColor [ttk::style lookup . -focuscolor] \
+            selectBackground [ttk::style lookup . -selectbackground] \
+            selectForeground [ttk::style lookup . -selectforeground] \
+            activeBackground [ttk::style lookup . -selectbackground] \
+            activeForeground [ttk::style lookup . -selectforeground]
+
+        ttk::style map . -foreground [list disabled $colors(-disabledfg)]
+
+        option add *font [ttk::style lookup . -font]
+        option add *Menu.selectcolor $colors(-fg)
+	}
+}
diff --git a/flexiznam/gui/flexigui.py b/flexiznam/gui/flexigui.py
new file mode 100644
index 0000000..eea1fd2
--- /dev/null
+++ b/flexiznam/gui/flexigui.py
@@ -0,0 +1,277 @@
+import os
+import tkinter as tk
+from tkinter import ttk
+from ttkwidgets import CheckboxTreeview
+import yaml
+from pathlib import Path
+import flexiznam as flz
+import flexiznam.camp.sync_data
+
+
+class FlexiGui(tk.Tk):
+
+    FLEXILIMS_ONLY_FIELDS = ("children", "project", "origin_id")
+
+    def __init__(self):
+        super().__init__()
+
+        self.title("FlexiZnam GUI")
+        self.geometry("800x600")
+
+        self.rowconfigure(1, weight=10)
+        self.columnconfigure(0, weight=1)
+        self.columnconfigure(1, weight=3)
+
+        self.frames = dict()
+        self._create_frames()
+        self._setup_widgets()
+        self._entity_by_itemid = {}
+
+    def _setup_widgets(self):
+        self._create_frames()
+        self._create_buttons()
+        self._create_treeview()
+        self._create_textview()
+
+    def _create_frames(self):
+        self.frames["t"] = tk.Frame(self)
+        self.frames["t"].grid(
+            row=0, column=0, padx=10, pady=5, columnspan=2, sticky="nwe"
+        )
+        self.frames["t"].rowconfigure(0, weight=1)
+        self.frames["t"].rowconfigure(1, weight=1)
+        for i in range(10):
+            self.frames["t"].columnconfigure(i, weight=1)
+        self.frames["t"].columnconfigure(3, weight=10)
+        self.frames["bl"] = tk.Frame(self)
+        self.frames["bl"].grid(row=1, column=0, padx=10, pady=5, sticky="nsew")
+        self.frames["bl"].rowconfigure(0, weight=1)
+        self.frames["bl"].columnconfigure(0, weight=1)
+        self.frames["br"] = tk.Frame(self)
+        self.frames["br"].grid(row=1, column=1, padx=10, pady=5, sticky="nsew")
+        self.frames["br"].rowconfigure(0, weight=1)
+        self.frames["br"].rowconfigure(1, weight=30)
+        self.frames["br"].rowconfigure(2, weight=1)
+        self.frames["br"].columnconfigure(0, weight=1)
+
+    def _create_treeview(self):
+        # Create the Treeview
+        self.treeview = CheckboxTreeview(
+            self.frames["bl"],
+            columns=("datatype",),
+            selectmode="browse",
+        )
+
+        self.treeview.grid(row=0, column=0, sticky="nsew")
+        self.treeview.heading("datatype", text="Datatype")
+        self.treeview.column("datatype", width=200)
+        # Bind the Treeview selection event
+        self.treeview.bind("<<TreeviewSelect>>", self.on_treeview_select)
+        self.treeview.tag_configure("error", background="red")
+
+    def _create_textview(self):
+
+        # Create the Text widget
+        tk.Label(self.frames["br"], text="Selected item:").grid(
+            row=0,
+            column=0,
+            sticky="nw",
+        )
+        self.selected_item = tk.StringVar()
+        self.selected_item.set("None")
+        l = tk.Label(self.frames["br"], textvariable=self.selected_item)
+        l.grid(row=0, column=1, sticky="new")
+        self.textview = tk.Text(self.frames["br"], width=40, height=10, wrap="none")
+        self.textview.grid(row=1, column=0, sticky="nsew", columnspan=2)
+        self.textview.bind("<<Modified>>", self.on_textview_change)
+        self.update_item_btn = tk.Button(
+            self.frames["br"], text="Update item", command=self.update_item
+        )
+        self.update_item_btn.grid(row=2, column=1, sticky="nsw")
+
+    def parse_folder(self):
+        genealogy = self.genealogy.get()
+        if genealogy.startswith("ENTER COMMA"):
+            tk.messagebox.showerror("Error", "Error: enter genealogy first!")
+            return
+        project = self.project.get()
+        if project == "SELECT PROJECT":
+            tk.messagebox.showerror("Error", "Error: select project first!")
+            return
+        genealogy = [g.strip() for g in genealogy.split(",")]
+        self.root_folder.set(
+            tk.filedialog.askdirectory(
+                initialdir=self.root_folder.get(), title="Select directory to parse"
+            )
+        )
+        data = flz.camp.sync_data.create_yaml_dict(
+            root_folder=self.root_folder.get(),
+            project=project,
+            genealogy=genealogy,
+            format_yaml=True,
+        )
+        self.data = data
+        self.update_data()
+
+    def _create_buttons(self):
+        topf = self.frames["t"]
+        self.parse_btn = tk.Button(topf, text="Parse folder", command=self.parse_folder)
+        self.parse_btn.grid(row=0, column=0, sticky="w")
+        self.load_btn = tk.Button(topf, text="Load yaml", command=self.load_yaml)
+        self.load_btn.grid(row=0, column=1, sticky="w")
+        self.write_btn = tk.Button(topf, text="Write yaml", command=self.write_yaml)
+        self.write_btn.grid(row=0, column=2)
+
+        # add project dropdown and label
+        tk.Label(topf, text="Project:").grid(row=0, column=3, sticky="w")
+        self.project = tk.StringVar(self)
+        self.project.set("SELECT PROJECT")
+        self.proj_ddwn = tk.OptionMenu(
+            topf,
+            self.project,
+            "SELECT PROJECT",
+            *flz.PARAMETERS["project_ids"].keys(),
+        ).grid(row=0, column=4, columnspan=3, sticky="w")
+        self.upload_btn = tk.Button(topf, text="Upload to flexilims")
+        self.upload_btn.grid(row=0, column=7)
+
+        self.quit_btn = tk.Button(topf, text="Quit", command=self.quit)
+        self.quit_btn.grid(row=0, column=10, sticky="e")
+
+        # add genealogy and root dir
+        tk.Label(topf, text="Genealogy:").grid(row=1, column=0, sticky="w")
+        self.genealogy = tk.StringVar(self)
+        self.genealogy.set("ENTER COMMA SEPARATED GENEALOGY")
+        self.genealogy_entry = tk.Entry(topf, textvariable=self.genealogy)
+        self.genealogy_entry.grid(row=1, column=1, columnspan=3, sticky="nsew")
+        tk.Label(topf, text="Root directory:").grid(row=1, column=4, sticky="w")
+        self.root_folder = tk.StringVar(self)
+        self.root_folder.set(os.getcwd())
+        self.root_folder_entry = tk.Entry(topf, textvariable=self.root_folder)
+        self.root_folder_entry.grid(row=1, column=5, columnspan=5, sticky="nsew")
+        self.chg_dir_btn = tk.Button(topf, text="...", command=self.chg_root_folder)
+        self.chg_dir_btn.grid(row=1, column=10)
+
+    def chg_root_folder(self):
+        self.root_folder.set(
+            tk.filedialog.askdirectory(
+                initialdir=self.root_folder.get(), title="Select root directory"
+            )
+        )
+
+    def on_treeview_select(self, event):
+        item = self.treeview.focus()
+        name, data = self._entity_by_itemid[item]
+        self.selected_item.set(name)
+        display = {k: v for k, v in data.items() if k not in self.FLEXILIMS_ONLY_FIELDS}
+        self.textview.delete(1.0, tk.END)
+        self.textview.insert(tk.END, yaml.dump(display))
+
+    def on_textview_change(self, event):
+        print('Textview changed: "{}"'.format(event))
+
+    def load_yaml(self):
+        """Load a YAML file and display it in the treeview"""
+        print("Select YAML file to load")
+        filetypes = (("Yaml files", "*.yml *.yaml"), ("All files", "*.*"))
+
+        self.filename = tk.filedialog.askopenfilename(
+            title="Select YAML file to load", filetypes=filetypes
+        )
+        with open(self.filename, "r") as f:
+            self.data = yaml.safe_load(f)
+        print('Loaded YAML file "{}"'.format(self.filename))
+        self.update_data()
+
+    def update_data(self, name_to_select=None):
+        """Update GUI data from self.data
+
+        Args:
+            name_to_select (str, optional): Name of item to select in treeview.
+                Defaults to None."""
+        self.textview.delete("1.0", tk.END)
+        self.selected_item.set("None")
+        self.treeview.delete(*self.treeview.get_children())
+        self._entity_by_itemid = {}
+        self._insert_yaml_data(self.data["children"], name_to_select=name_to_select)
+
+    def _insert_yaml_data(self, data, parent="", name_to_select=None):
+        assert isinstance(data, dict), "data must be a dict"
+        for child, child_data in data.items():
+            assert "type" in child_data, f"datatype missing for {child}"
+            dtype = child_data["type"]
+            item = self.treeview.insert(
+                parent,
+                "end",
+                text=child,
+                values=[dtype],
+                open=True,
+            )
+            if any(
+                [
+                    v.startswith("XXERRORXX")
+                    for v in child_data.values()
+                    if isinstance(v, str)
+                ]
+            ):
+                self.treeview.item(item, tags=("error",))
+
+            self._entity_by_itemid[item] = (child, child_data)
+            if name_to_select and child == name_to_select:
+                self.treeview.focus(item)
+                self.treeview.selection_set(item)
+
+            if "children" in child_data:
+                self._insert_yaml_data(
+                    child_data["children"], parent=item, name_to_select=name_to_select
+                )
+
+    def write_yaml(self):
+        """Write the current data to a YAML file"""
+        target = tk.filedialog.asksaveasfilename(
+            initialdir=self.root_folder.get(),
+            title="Select YAML file to write",
+            filetypes=(("Yaml files", "*.yml *.yaml"), ("All files", "*.*")),
+        )
+        data = dict(self.data)
+        data["project"] = self.project.get()
+        data["root_folder"] = self.root_folder.get()
+        with open(target, "w") as f:
+            yaml.dump(data, f)
+        print('Wrote YAML file "{}"'.format(target))
+
+    def update_item(self):
+        """Update the selected item with the textview contents"""
+        text = self.textview.get(1.0, tk.END)
+        if not text.strip():
+            return
+        item = self.treeview.focus()
+        name, original_data = self._entity_by_itemid[item]
+        assert name == self.selected_item.get(), "Selected item does not match"
+        data = yaml.safe_load(text)
+        for field in self.FLEXILIMS_ONLY_FIELDS:
+            if field in original_data:
+                data[field] = original_data[field]
+        self._entity_by_itemid[item] = (name, data)
+        parents = []
+        parent_id = item
+        while True:
+            parent = self.treeview.parent(parent_id)
+            if not parent:
+                break
+            parents.append(self._entity_by_itemid[parent][0])
+            parent_id = parent
+        ref = self.data
+        for parent in reversed(parents):
+            ref = ref["children"][parent]
+        ref["children"][name] = data
+        self.update_data(name_to_select=name)
+
+
+if __name__ == "__main__":
+    app = FlexiGui()
+    with open("test.yml", "r") as f:
+        data = yaml.safe_load(f)
+    app.data = data
+    app.update_data()
+    app.mainloop()

From 6cbf2dca6e7f2a57abd99d3379a3292698ff4d38 Mon Sep 17 00:00:00 2001
From: Antonin Blot <antonin.blot@crick.ac.uk>
Date: Sun, 28 May 2023 09:16:14 +0100
Subject: [PATCH 03/73] Use "type" instead of "datatype"

As flexilims does
---
 flexiznam/camp/sync_data.py | 8 ++++----
 1 file changed, 4 insertions(+), 4 deletions(-)

diff --git a/flexiznam/camp/sync_data.py b/flexiznam/camp/sync_data.py
index 0cd5faf..abcf05a 100644
--- a/flexiznam/camp/sync_data.py
+++ b/flexiznam/camp/sync_data.py
@@ -83,16 +83,16 @@ def _create_yaml_dict(
     level_name = level_folder.stem
     m = re.fullmatch(r"R\d\d\d\d\d\d_?(.*)?", level_name)
     if m:
-        level_dict["datatype"] = "recording"
+        level_dict["type"] = "recording"
         level_dict["protocol"] = (
             m[1] if m[1] is not None else "XXERRORXX PROTOCOL NOT SPECIFIED"
         )
         level_dict["recording_type"] = "XXERRORXX error RECORDING TYPE NOT SPECIFIED"
 
     elif re.fullmatch(r"S\d*", level_name):
-        level_dict["datatype"] = "session"
+        level_dict["type"] = "session"
     else:
-        level_dict["datatype"] = "sample"
+        level_dict["type"] = "sample"
     level_dict["genealogy"] = genealogy + [level_name]
     level_dict["path"] = Path(project, *level_dict["genealogy"])
     if format_yaml:
@@ -185,7 +185,7 @@ def _upload_yaml_dict(
 ):
     for entity, entity_data in yaml_dict.items():
         children = entity_data.pop("children", {})
-        datatype = entity_data.pop("datatype")
+        datatype = entity_data.pop("type")
         if datatype == "session":
             if verbose:
                 print(f"Adding session `{entity}`")

From f9ecaaf551b3991487ae3fcc6f19e22ad694c63d Mon Sep 17 00:00:00 2001
From: Antonin Blot <antonin.blot@crick.ac.uk>
Date: Sun, 28 May 2023 09:16:58 +0100
Subject: [PATCH 04/73] create_yaml uses origin_name instead of genealogy

---
 flexiznam/camp/sync_data.py | 15 +++++++++------
 1 file changed, 9 insertions(+), 6 deletions(-)

diff --git a/flexiznam/camp/sync_data.py b/flexiznam/camp/sync_data.py
index abcf05a..699f39a 100644
--- a/flexiznam/camp/sync_data.py
+++ b/flexiznam/camp/sync_data.py
@@ -18,7 +18,7 @@
 def create_yaml_dict(
     root_folder,
     project,
-    genealogy,
+    origin_name,
     format_yaml=True,
 ):
     """Create a yaml dict from a folder
@@ -28,9 +28,8 @@ def create_yaml_dict(
     Args:
         root_folder (str): Path to the folder to parse
         project (str): Name of the project, used as root of the path in the output
-        genealogy (list): List of strings with the genealogy of root_folder. If
-            root_folder is a recording for instance, genealogy should be (mouse,
-            session).
+        origin_name (str): Name of the origin on flexilims. Must be online and have 
+            genealogy set.
         format_yaml (bool, optional): Format the output to be yaml compatible if True,
             otherwise keep dataset as Dataset object and path as pathlib.Path. Defaults
             to True.
@@ -39,8 +38,12 @@ def create_yaml_dict(
         dict: Dictionary with the structure of the folder and automatically detected
             datasets
     """
-    if isinstance(genealogy, str):
-        genealogy = [genealogy]
+    flm_sess = flz.Session(project=project)
+    with warnings.catch_warnings():
+        warnings.simplefilter("ignore")
+        origin = flm_sess.get_origin(origin_name)
+    genealogy = origin.genealogy
+    
     data = _create_yaml_dict(
         level_folder=root_folder,
         project=project,

From 13dd5867738d27fc30393a61b6fe2a463ab96d83 Mon Sep 17 00:00:00 2001
From: Antonin Blot <antonin.blot@crick.ac.uk>
Date: Sun, 28 May 2023 09:17:52 +0100
Subject: [PATCH 05/73] Remove from yaml flexilims only fields

These would be erased by info from other sources when uploading
---
 flexiznam/camp/sync_data.py | 3 +++
 1 file changed, 3 insertions(+)

diff --git a/flexiznam/camp/sync_data.py b/flexiznam/camp/sync_data.py
index 699f39a..1c2448b 100644
--- a/flexiznam/camp/sync_data.py
+++ b/flexiznam/camp/sync_data.py
@@ -110,6 +110,9 @@ def _create_yaml_dict(
                 proot = str(level_folder)[: -len(level_dict["path"])]
                 ds.path = ds.path.relative_to(proot)
                 children[ds_name] = ds.format(mode="yaml")
+                # remove fields that are not needed
+                for field in ["origin_id", "project_id", "name"]:
+                    children[ds_name].pop(field, None)
                 children[ds_name]["path"] = str(
                     PurePosixPath(children[ds_name]["path"])
                 )

From 0ef9f6c991d41aada4541accd320ad2de05bd43a Mon Sep 17 00:00:00 2001
From: Antonin Blot <antonin.blot@crick.ac.uk>
Date: Sun, 28 May 2023 09:18:48 +0100
Subject: [PATCH 06/73] Add skeleton of check_yaml_validity

---
 flexiznam/camp/sync_data.py | 16 ++++++++++++++--
 1 file changed, 14 insertions(+), 2 deletions(-)

diff --git a/flexiznam/camp/sync_data.py b/flexiznam/camp/sync_data.py
index 1c2448b..908b6a4 100644
--- a/flexiznam/camp/sync_data.py
+++ b/flexiznam/camp/sync_data.py
@@ -28,7 +28,7 @@ def create_yaml_dict(
     Args:
         root_folder (str): Path to the folder to parse
         project (str): Name of the project, used as root of the path in the output
-        origin_name (str): Name of the origin on flexilims. Must be online and have 
+        origin_name (str): Name of the origin on flexilims. Must be online and have
             genealogy set.
         format_yaml (bool, optional): Format the output to be yaml compatible if True,
             otherwise keep dataset as Dataset object and path as pathlib.Path. Defaults
@@ -43,7 +43,7 @@ def create_yaml_dict(
         warnings.simplefilter("ignore")
         origin = flm_sess.get_origin(origin_name)
     genealogy = origin.genealogy
-    
+
     data = _create_yaml_dict(
         level_folder=root_folder,
         project=project,
@@ -262,6 +262,18 @@ def _upload_yaml_dict(
         )
 
 
+def check_yaml_validity(yaml, root_folder, origin_name):
+    if isinstance(yaml, str):
+        with open(yaml, "r") as f:
+            yaml = yaml.safe_load(f)
+    assert yaml["root_folder"] == root_folder, f"root_folder should be {root_folder}"
+    _check_recursively(yaml["children"], root_folder, origin_name)
+
+
+def _check_recursively(yaml, root_folder):
+    raise NotImplementedError
+
+
 if __name__ == "__main__":
     data = create_yaml_dict(
         "/Volumes/lab-znamenskiyp/data/instruments/raw_data/projects/blota_onix_pilote/BRAC7448.2d/S20230412",

From 078c5c0ef38fbc1d4be573ee92acac1edcd508bb Mon Sep 17 00:00:00 2001
From: Antonin Blot <antonin.blot@crick.ac.uk>
Date: Sun, 28 May 2023 17:55:05 +0100
Subject: [PATCH 07/73] GUI can upload to flexilims

---
 flexiznam/gui/flexigui.py | 112 ++++++++++++++++++++++++++------------
 1 file changed, 78 insertions(+), 34 deletions(-)

diff --git a/flexiznam/gui/flexigui.py b/flexiznam/gui/flexigui.py
index eea1fd2..f4ddf5f 100644
--- a/flexiznam/gui/flexigui.py
+++ b/flexiznam/gui/flexigui.py
@@ -11,6 +11,7 @@
 class FlexiGui(tk.Tk):
 
     FLEXILIMS_ONLY_FIELDS = ("children", "project", "origin_id")
+    RESOURCES = Path(__file__).parent
 
     def __init__(self):
         super().__init__()
@@ -27,6 +28,8 @@ def __init__(self):
         self._setup_widgets()
         self._entity_by_itemid = {}
 
+        self.data = {}
+
     def _setup_widgets(self):
         self._create_frames()
         self._create_buttons()
@@ -89,25 +92,26 @@ def _create_textview(self):
         )
         self.update_item_btn.grid(row=2, column=1, sticky="nsw")
 
+    def _check_options_are_set(self, options=("project", "origin_name")):
+        init_values = dict(project="SELECT", origin_name="ENTER")
+        for option in options:
+            value = getattr(self, option).get()
+            if value.startswith(init_values[option]):
+                tk.messagebox.showerror("Error", f"Error: enter {option} first!")
+                return False
+        return True
+
     def parse_folder(self):
-        genealogy = self.genealogy.get()
-        if genealogy.startswith("ENTER COMMA"):
-            tk.messagebox.showerror("Error", "Error: enter genealogy first!")
-            return
-        project = self.project.get()
-        if project == "SELECT PROJECT":
-            tk.messagebox.showerror("Error", "Error: select project first!")
+        if not self._check_options_are_set():
             return
-        genealogy = [g.strip() for g in genealogy.split(",")]
-        self.root_folder.set(
-            tk.filedialog.askdirectory(
-                initialdir=self.root_folder.get(), title="Select directory to parse"
-            )
+        folder = tk.filedialog.askdirectory(
+            initialdir=self.root_folder.get(), title="Select directory to parse"
         )
+        self.root_folder.set(folder)
         data = flz.camp.sync_data.create_yaml_dict(
-            root_folder=self.root_folder.get(),
-            project=project,
-            genealogy=genealogy,
+            root_folder=folder,
+            project=self.project.get(),
+            origin_name=self.origin_name.get(),
             format_yaml=True,
         )
         self.data = data
@@ -115,11 +119,11 @@ def parse_folder(self):
 
     def _create_buttons(self):
         topf = self.frames["t"]
-        self.parse_btn = tk.Button(topf, text="Parse folder", command=self.parse_folder)
+        self.parse_btn = tk.Button(topf, text="Parse", command=self.parse_folder)
         self.parse_btn.grid(row=0, column=0, sticky="w")
-        self.load_btn = tk.Button(topf, text="Load yaml", command=self.load_yaml)
+        self.load_btn = tk.Button(topf, text="Load", command=self.load_yaml)
         self.load_btn.grid(row=0, column=1, sticky="w")
-        self.write_btn = tk.Button(topf, text="Write yaml", command=self.write_yaml)
+        self.write_btn = tk.Button(topf, text="Write", command=self.write_yaml)
         self.write_btn.grid(row=0, column=2)
 
         # add project dropdown and label
@@ -132,23 +136,33 @@ def _create_buttons(self):
             "SELECT PROJECT",
             *flz.PARAMETERS["project_ids"].keys(),
         ).grid(row=0, column=4, columnspan=3, sticky="w")
-        self.upload_btn = tk.Button(topf, text="Upload to flexilims")
+        fllogo = tk.PhotoImage(file=str(self.RESOURCES / "flexilims_logo.png"))
+        fllogo = fllogo.subsample(10, 10)
+        self.upload_btn = tk.Button(topf, text="Upload", command=self.upload)
         self.upload_btn.grid(row=0, column=7)
 
+        # add conflicts dropdown and label
+        tk.Label(topf, text="Conflicts:").grid(row=0, column=8, sticky="w")
+        self.conflicts = tk.StringVar(self)
+        self.conflicts.set("abort")
+        self.conflicts_ddwn = tk.OptionMenu(
+            topf, self.conflicts, "abort", "overwrite", "skip"
+        )
+        self.conflicts_ddwn.grid(row=0, column=9, sticky="w")
         self.quit_btn = tk.Button(topf, text="Quit", command=self.quit)
         self.quit_btn.grid(row=0, column=10, sticky="e")
 
-        # add genealogy and root dir
-        tk.Label(topf, text="Genealogy:").grid(row=1, column=0, sticky="w")
-        self.genealogy = tk.StringVar(self)
-        self.genealogy.set("ENTER COMMA SEPARATED GENEALOGY")
-        self.genealogy_entry = tk.Entry(topf, textvariable=self.genealogy)
-        self.genealogy_entry.grid(row=1, column=1, columnspan=3, sticky="nsew")
-        tk.Label(topf, text="Root directory:").grid(row=1, column=4, sticky="w")
+        # add origin name and root dir
+        tk.Label(topf, text="Origin name:").grid(row=1, column=0, sticky="w")
+        self.origin_name = tk.StringVar(self)
+        self.origin_name.set("ENTER FLEXILIMS ORIGIN NAME")
+        self.origin_name_entry = tk.Entry(topf, textvariable=self.origin_name)
+        self.origin_name_entry.grid(row=1, column=1, columnspan=2, sticky="nsew")
+        tk.Label(topf, text="Root directory:").grid(row=1, column=3, sticky="w")
         self.root_folder = tk.StringVar(self)
         self.root_folder.set(os.getcwd())
         self.root_folder_entry = tk.Entry(topf, textvariable=self.root_folder)
-        self.root_folder_entry.grid(row=1, column=5, columnspan=5, sticky="nsew")
+        self.root_folder_entry.grid(row=1, column=4, columnspan=6, sticky="nsew")
         self.chg_dir_btn = tk.Button(topf, text="...", command=self.chg_root_folder)
         self.chg_dir_btn.grid(row=1, column=10)
 
@@ -175,12 +189,14 @@ def load_yaml(self):
         print("Select YAML file to load")
         filetypes = (("Yaml files", "*.yml *.yaml"), ("All files", "*.*"))
 
-        self.filename = tk.filedialog.askopenfilename(
+        filename = tk.filedialog.askopenfilename(
             title="Select YAML file to load", filetypes=filetypes
         )
-        with open(self.filename, "r") as f:
+        if not filename:
+            return
+        with open(filename, "r") as f:
             self.data = yaml.safe_load(f)
-        print('Loaded YAML file "{}"'.format(self.filename))
+        print('Loaded YAML file "{}"'.format(filename))
         self.update_data()
 
     def update_data(self, name_to_select=None):
@@ -193,6 +209,12 @@ def update_data(self, name_to_select=None):
         self.selected_item.set("None")
         self.treeview.delete(*self.treeview.get_children())
         self._entity_by_itemid = {}
+        if "project" in self.data:
+            self.project.set(self.data["project"])
+        if "origin_name" in self.data:
+            self.origin_name.set(self.data["origin_name"])
+        if "root_folder" in self.data:
+            self.root_folder.set(self.data["root_folder"])
         self._insert_yaml_data(self.data["children"], name_to_select=name_to_select)
 
     def _insert_yaml_data(self, data, parent="", name_to_select=None):
@@ -207,6 +229,7 @@ def _insert_yaml_data(self, data, parent="", name_to_select=None):
                 values=[dtype],
                 open=True,
             )
+            self.treeview.change_state(item, "checked")
             if any(
                 [
                     v.startswith("XXERRORXX")
@@ -240,6 +263,31 @@ def write_yaml(self):
             yaml.dump(data, f)
         print('Wrote YAML file "{}"'.format(target))
 
+    def upload(self):
+        """Upload data to flexilims"""
+        print("Uploading data to flexilims")
+        if not self._check_options_are_set():
+            return
+
+        data = dict(self.data)
+        if not data:
+            tk.messagebox.showerror("Error", "No data loaded")
+            return
+        data["project"] = self.project.get()
+        data["root_folder"] = self.root_folder.get()
+        if data["project"].startswith("XXERRORXX"):
+            print("Project name not set")
+            return
+        flz.camp.sync_data.upload_yaml(
+            source_yaml=data,
+            raw_data_folder=data["root_folder"],
+            verbose=True,
+            log_func=print,
+            flexilims_session=None,
+            conflicts=self.conflicts.get(),
+        )
+        print("Done")
+
     def update_item(self):
         """Update the selected item with the textview contents"""
         text = self.textview.get(1.0, tk.END)
@@ -270,8 +318,4 @@ def update_item(self):
 
 if __name__ == "__main__":
     app = FlexiGui()
-    with open("test.yml", "r") as f:
-        data = yaml.safe_load(f)
-    app.data = data
-    app.update_data()
     app.mainloop()

From b0312b0fe874b084cd3dd6f8b470f0040d977eaf Mon Sep 17 00:00:00 2001
From: Antonin Blot <antonin.blot@crick.ac.uk>
Date: Sun, 28 May 2023 17:56:39 +0100
Subject: [PATCH 08/73] bugfix & switch to origin_name

Specifying genealogy is a bit cumbersome and the entity must exist
online to upload anyway.
Just ask for the name and get genealogy from there
---
 flexiznam/camp/sync_data.py | 32 +++++++++++++++++++-------------
 1 file changed, 19 insertions(+), 13 deletions(-)

diff --git a/flexiznam/camp/sync_data.py b/flexiznam/camp/sync_data.py
index 908b6a4..bf2710a 100644
--- a/flexiznam/camp/sync_data.py
+++ b/flexiznam/camp/sync_data.py
@@ -38,11 +38,13 @@ def create_yaml_dict(
         dict: Dictionary with the structure of the folder and automatically detected
             datasets
     """
-    flm_sess = flz.Session(project=project)
+    flm_sess = flz.get_flexilims_session(project_id=project)
     with warnings.catch_warnings():
         warnings.simplefilter("ignore")
-        origin = flm_sess.get_origin(origin_name)
-    genealogy = origin.genealogy
+        origin = flz.get_entity(name=origin_name, flexilims_session=flm_sess)
+    assert origin is not None, f"Origin {origin_name} not found in project {project}"
+    assert "genealogy" in origin, f"Origin {origin_name} has no genealogy"
+    genealogy = origin["genealogy"]
 
     data = _create_yaml_dict(
         level_folder=root_folder,
@@ -51,7 +53,7 @@ def create_yaml_dict(
         format_yaml=format_yaml,
         parent_dict=dict(),
     )
-    out = dict(root_folder=root_folder, root_genealogy=genealogy, children=data)
+    out = dict(root_folder=root_folder, origin_name=origin_name, children=data)
     return out
 
 
@@ -144,7 +146,7 @@ def upload_yaml(
     """Upload data from one yaml to flexilims
 
     Args:
-        source_yaml (str): path to clean yaml
+        source_yaml (dict or str): path to clean yaml or yaml dict
         raw_data_folder (str): path to the folder containing the data. Default to
             data_root['raw']
         verbose (bool): print progress information
@@ -159,15 +161,20 @@ def upload_yaml(
         list of names of entities created/updated
 
     """
-    with open(source_yaml, "r") as f:
-        yaml_data = yaml.safe_load(f)
+    if isinstance(source_yaml, str):
+        source_yaml = Path(source_yaml)
+        with open(source_yaml, "r") as f:
+            yaml_data = yaml.safe_load(f)
+    else:
+        assert isinstance(source_yaml, dict), "source_yaml must be a dict or a path"
+        yaml_data = source_yaml
 
     # first find the origin
 
     if flexilims_session is None:
         flexilims_session = flz.get_flexilims_session(project_id=yaml_data["project"])
 
-    origin_name = "_".join(yaml_data["root_genealogy"])
+    origin_name = yaml_data["origin_name"]
     with warnings.catch_warnings():
         warnings.simplefilter("ignore")
         origin = flz.get_entity(name=origin_name, flexilims_session=flexilims_session)
@@ -243,12 +250,11 @@ def _upload_yaml_dict(
                 path=path,
                 genealogy=genealogy,
                 is_raw="yes",
-                project_id=None,
-                flexilims_session=None,
-                dataset_name=None,
-                attributes=None,
+                flexilims_session=flexilims_session,
+                dataset_name=entity,
+                attributes=entity_data['extra_attributes'],
                 strict_validation=False,
-                conflicts="append",
+                conflicts=conflicts,
             )
 
         _upload_yaml_dict(

From 94c9be9bc41b2d9d4d8f35ce73a73adc19543351 Mon Sep 17 00:00:00 2001
From: Antonin Blot <antonin.blot@crick.ac.uk>
Date: Sun, 28 May 2023 21:14:22 +0100
Subject: [PATCH 09/73] [minor] black and remove __main__ part

That was for debugging
---
 flexiznam/camp/sync_data.py | 15 +--------------
 1 file changed, 1 insertion(+), 14 deletions(-)

diff --git a/flexiznam/camp/sync_data.py b/flexiznam/camp/sync_data.py
index bf2710a..c48ef6a 100644
--- a/flexiznam/camp/sync_data.py
+++ b/flexiznam/camp/sync_data.py
@@ -252,7 +252,7 @@ def _upload_yaml_dict(
                 is_raw="yes",
                 flexilims_session=flexilims_session,
                 dataset_name=entity,
-                attributes=entity_data['extra_attributes'],
+                attributes=entity_data["extra_attributes"],
                 strict_validation=False,
                 conflicts=conflicts,
             )
@@ -278,16 +278,3 @@ def check_yaml_validity(yaml, root_folder, origin_name):
 
 def _check_recursively(yaml, root_folder):
     raise NotImplementedError
-
-
-if __name__ == "__main__":
-    data = create_yaml_dict(
-        "/Volumes/lab-znamenskiyp/data/instruments/raw_data/projects/blota_onix_pilote/BRAC7448.2d/S20230412",
-        project="blota_onix_pilote",
-        genealogy="BRAC7448.2d",
-    )
-    with open("test.yml", "w") as writer:
-        yaml.safe_dump(data, writer)
-    print("done")
-    flm_sess = flz.get_flexilims_session(project_id="blota_onix_pilote")
-    upload_yaml("test.yml", conflicts="overwrite", flexilims_session=flm_sess)

From 055df0a7087d41638729bc0d9128afadf531bbd5 Mon Sep 17 00:00:00 2001
From: Antonin Blot <antonin.blot@crick.ac.uk>
Date: Sun, 28 May 2023 23:18:34 +0100
Subject: [PATCH 10/73] change minor version number

Because of large change
---
 CHANGELOG.md | 2 +-
 setup.py     | 2 +-
 2 files changed, 2 insertions(+), 2 deletions(-)

diff --git a/CHANGELOG.md b/CHANGELOG.md
index 0e6ca1a..d1eba92 100644
--- a/CHANGELOG.md
+++ b/CHANGELOG.md
@@ -1,6 +1,6 @@
 # Change log
 
-## v0.3.5
+## v0.4.0
 
 ### Main changes
 - `flz.get_datasets` can return `Dataset` objects instead of path strings if 
diff --git a/setup.py b/setup.py
index 73edfee..e0cb181 100755
--- a/setup.py
+++ b/setup.py
@@ -2,7 +2,7 @@
 
 setup(
     name="flexiznam",
-    version="v0.3.5",
+    version="v0.4.0",
     url="https://github.com/znamlab/flexznam",
     license="MIT",
     author="Antonin Blot",

From 4634006a4fc5669c2c74c1b76707c23e6b128a8c Mon Sep 17 00:00:00 2001
From: Antonin Blot <antonin.blot@crick.ac.uk>
Date: Thu, 1 Jun 2023 12:07:56 +0100
Subject: [PATCH 11/73] [bugfix] add genealogy from parent

add_dataset does not require the genealogy
---
 flexiznam/camp/sync_data.py | 7 ++++---
 1 file changed, 4 insertions(+), 3 deletions(-)

diff --git a/flexiznam/camp/sync_data.py b/flexiznam/camp/sync_data.py
index c48ef6a..86d2b93 100644
--- a/flexiznam/camp/sync_data.py
+++ b/flexiznam/camp/sync_data.py
@@ -197,6 +197,7 @@ def _upload_yaml_dict(
     yaml_dict, origin, raw_data_folder, log_func, flexilims_session, conflicts, verbose
 ):
     for entity, entity_data in yaml_dict.items():
+        entity_data = entity_data.copy()
         children = entity_data.pop("children", {})
         datatype = entity_data.pop("type")
         if datatype == "session":
@@ -240,7 +241,8 @@ def _upload_yaml_dict(
             created = entity_data.pop("created")
             dataset_type = entity_data.pop("dataset_type")
             path = entity_data.pop("path")
-            genealogy = entity_data.pop("genealogy")
+            is_raw = entity_data.pop("is_raw")
+
             if verbose:
                 print(f"Adding dataset `{entity}`, type `{dataset_type}`")
             new_entity = flz.add_dataset(
@@ -248,8 +250,7 @@ def _upload_yaml_dict(
                 dataset_type=dataset_type,
                 created=created,
                 path=path,
-                genealogy=genealogy,
-                is_raw="yes",
+                is_raw=is_raw,
                 flexilims_session=flexilims_session,
                 dataset_name=entity,
                 attributes=entity_data["extra_attributes"],

From b50f755fedf4ea9346dd01e21ff1381b8eff9b11 Mon Sep 17 00:00:00 2001
From: Antonin Blot <antonin.blot@crick.ac.uk>
Date: Thu, 1 Jun 2023 12:10:11 +0100
Subject: [PATCH 12/73] [bugfix] add with full names

add_recording and add_sample
---
 CHANGELOG.md      |  5 ++++-
 flexiznam/main.py | 53 ++++++++++++++++++++++++++---------------------
 2 files changed, 33 insertions(+), 25 deletions(-)

diff --git a/CHANGELOG.md b/CHANGELOG.md
index d1eba92..a419183 100644
--- a/CHANGELOG.md
+++ b/CHANGELOG.md
@@ -16,7 +16,8 @@
 - `Dataset.format(mode='yaml')` ensure yaml compatibility. (path to str, tuple to list, 
   etc...)
 - `add_experimental_session` can be done with `parent_id` (or `parent_name`).
-- `add_dataset` can add a dataset to a mouse.
+- `add_dataset` can add a dataset to a mouse and does not require genealogy.
+
 
 ### Bugfixes
 - Fix [#68](https://github.com/znamlab/flexiznam/issues/68). Dataset.format returns 
@@ -24,6 +25,8 @@
 - Fix [#88](https://github.com/znamlab/flexiznam/issues/88). Now make attributes JSON
   compatible before uploading to flexilims. This will replace special characters in
   attribute names by `_` in the database.
+- `add_recording` and `add_sample` add the value online with the full name (including
+  genealogy) rather than the short name.
 
 ## v0.3.4
 
diff --git a/flexiznam/main.py b/flexiznam/main.py
index 46b8fe9..a8436fc 100755
--- a/flexiznam/main.py
+++ b/flexiznam/main.py
@@ -321,17 +321,16 @@ def add_recording(
             "conflicts must be `skip`, `abort`, `overwrite` or `update`"
         )
 
-    experimental_session = get_entity(
-        datatype="session", flexilims_session=flexilims_session, id=session_id
-    )
+    parent_series = get_entity(flexilims_session=flexilims_session, id=session_id)
     recording_info = {"recording_type": recording_type, "protocol": protocol}
+
     if attributes is None:
         attributes = {}
     if "path" not in attributes:
         attributes["path"] = str(
             Path(
                 get_path(
-                    experimental_session["path"],
+                    parent_series["path"],
                     datatype="session",
                     flexilims_session=flexilims_session,
                 )
@@ -347,20 +346,25 @@ def add_recording(
     recording_info.update(attributes)
 
     if recording_name is None:
-        recording_name = experimental_session["name"] + "_" + protocol + "_0"
+        recording_name = parent_series["name"] + "_" + protocol + "_0"
+
+    if "genealogy" not in attributes:
+        attributes["genealogy"] = list(parent_series["genealogy"]) + [recording_name]
+    rec_full_name = "_".join(attributes["genealogy"])
+
     online_recording = get_entity(
-        datatype="recording", name=recording_name, flexilims_session=flexilims_session
+        datatype="recording", name=rec_full_name, flexilims_session=flexilims_session
     )
     if online_recording is not None:
         if conflicts.lower() == "skip":
-            print("A recording named %s already exists" % (recording_name))
+            print("A recording named %s already exists" % (rec_full_name))
             return online_recording
         elif conflicts.lower() == "abort":
-            raise FlexilimsError("A recording named %s already exists" % recording_name)
+            raise FlexilimsError("A recording named %s already exists" % rec_full_name)
         else:
             resp = update_entity(
                 datatype="recording",
-                name=recording_name,
+                name=rec_full_name,
                 id=online_recording["id"],
                 origin_id=session_id,
                 mode=conflicts,
@@ -372,7 +376,7 @@ def add_recording(
 
     resp = flexilims_session.post(
         datatype="recording",
-        name=recording_name,
+        name=rec_full_name,
         attributes=recording_info,
         origin_id=session_id,
         other_relations=other_relations,
@@ -529,7 +533,6 @@ def add_dataset(
     dataset_type,
     created,
     path,
-    genealogy,
     is_raw="yes",
     project_id=None,
     flexilims_session=None,
@@ -545,8 +548,6 @@ def add_dataset(
         dataset_type (str): dataset_type, must be a type define in the config file
         created (str): date of creation as text, usually in this format: '2021-05-24 14:56:41'
         path (str): path to the data relative to the project folder
-        genealogy (tuple): parents of this dataset from the project (excluded) down to
-                           the dataset name itself (included)
         is_raw (str): `yes` or `no`, used to find the root directory
         project_id (str): hexadecimal ID or name of the project
         flexilims_session (:py:class:`flexilims.Flexilims`): authentication
@@ -572,11 +573,10 @@ def add_dataset(
     if conflicts.lower() not in valid_conflicts:
         raise AttributeError("`conflicts` must be in [%s]" % ", ".join(valid_conflicts))
 
+    parent = get_entity(flexilims_session=flexilims_session, id=parent_id)
+
     if dataset_name is None:
-        parent_name = get_entity(
-            flexilims_session=flexilims_session,
-            id=parent_id,
-        )["name"]
+        parent_name = parent["name"]
         dataset_name = parent_name + "_" + dataset_type + "_0"
 
     dataset_info = {
@@ -584,7 +584,7 @@ def add_dataset(
         "created": created,
         "path": path,
         "is_raw": is_raw,
-        "genealogy": genealogy,
+        "genealogy": list(parent["genealogy"]),
     }
     reserved_attributes = ["dataset_type", "created", "path", "is_raw", "genealogy"]
     if attributes is not None:
@@ -596,32 +596,37 @@ def add_dataset(
         dataset_name = generate_name(
             "dataset", dataset_name, flexilims_session=flexilims_session
         )
+        dataset_info["genealogy"].append(dataset_name)
+        dataset_full_name = "_".join(dataset_info["genealogy"])
     else:
+        dataset_info["genealogy"].append(dataset_name)
+        dataset_full_name = "_".join(dataset_info["genealogy"])
         online_version = get_entity(
-            "dataset", name=dataset_name, flexilims_session=flexilims_session
+            "dataset", name=dataset_full_name, flexilims_session=flexilims_session
         )
         if online_version is not None:
             if conflicts.lower() == "abort":
-                raise FlexilimsError("A dataset named %s already exists" % dataset_name)
+                raise FlexilimsError(
+                    "A dataset named %s already exists" % dataset_full_name
+                )
             elif conflicts.lower() == "skip":
-                print("A dataset named %s already exists" % dataset_name)
+                print("A dataset named %s already exists" % dataset_full_name)
                 return online_version
             else:
                 resp = update_entity(
                     datatype="dataset",
-                    name=dataset_name,
+                    name=dataset_full_name,
                     id=online_version["id"],
                     origin_id=parent_id,
                     mode=conflicts,
                     attributes=dataset_info,
-                    other_relations=None,
                     flexilims_session=flexilims_session,
                 )
                 return resp
 
     resp = flexilims_session.post(
         datatype="dataset",
-        name=dataset_name,
+        name=dataset_full_name,
         origin_id=parent_id,
         attributes=dataset_info,
         strict_validation=strict_validation,

From da64162e60b73970242fd1e0f36ed4368a704831 Mon Sep 17 00:00:00 2001
From: BenitaTB <62141042+BenitaTB@users.noreply.github.com>
Date: Thu, 1 Jun 2023 14:35:13 +0100
Subject: [PATCH 13/73] add dependency for ttkwidgets

---
 setup.py | 1 +
 1 file changed, 1 insertion(+)

diff --git a/setup.py b/setup.py
index e0cb181..b0e2ebe 100755
--- a/setup.py
+++ b/setup.py
@@ -18,6 +18,7 @@
         "flexilims @ git+ssh://git@github.com/znamlab/flexilims.git#egg=flexilims",
         "pymcms @ git+ssh://git@github.com/znamlab/pymcms.git#egg=pymcms",
         "tifffile",
+        "ttkwidgets",
     ],
     entry_points="""
         [console_scripts]

From 99d3e404f9c9f578cf8fd4c84b59c4cbe7cffeac Mon Sep 17 00:00:00 2001
From: BenitaTB <62141042+BenitaTB@users.noreply.github.com>
Date: Thu, 1 Jun 2023 14:35:33 +0100
Subject: [PATCH 14/73] bug fix removed logo.png

---
 flexiznam/gui/flexigui.py | 4 ----
 1 file changed, 4 deletions(-)

diff --git a/flexiznam/gui/flexigui.py b/flexiznam/gui/flexigui.py
index f4ddf5f..a3f4aa1 100644
--- a/flexiznam/gui/flexigui.py
+++ b/flexiznam/gui/flexigui.py
@@ -9,7 +9,6 @@
 
 
 class FlexiGui(tk.Tk):
-
     FLEXILIMS_ONLY_FIELDS = ("children", "project", "origin_id")
     RESOURCES = Path(__file__).parent
 
@@ -73,7 +72,6 @@ def _create_treeview(self):
         self.treeview.tag_configure("error", background="red")
 
     def _create_textview(self):
-
         # Create the Text widget
         tk.Label(self.frames["br"], text="Selected item:").grid(
             row=0,
@@ -136,8 +134,6 @@ def _create_buttons(self):
             "SELECT PROJECT",
             *flz.PARAMETERS["project_ids"].keys(),
         ).grid(row=0, column=4, columnspan=3, sticky="w")
-        fllogo = tk.PhotoImage(file=str(self.RESOURCES / "flexilims_logo.png"))
-        fllogo = fllogo.subsample(10, 10)
         self.upload_btn = tk.Button(topf, text="Upload", command=self.upload)
         self.upload_btn.grid(row=0, column=7)
 

From 8efd593e0b1660acc115d02706e168f9ee7aa588 Mon Sep 17 00:00:00 2001
From: Antonin Blot <antonin.blot@crick.ac.uk>
Date: Thu, 1 Jun 2023 17:08:23 +0100
Subject: [PATCH 15/73] [feature] dataset validity

Each dataset subclass has an is_valid method checking if the expected
files actually exist
---
 flexiznam/schema/camera_data.py     | 18 +++++++++--------
 flexiznam/schema/datasets.py        | 11 ++++++----
 flexiznam/schema/harp_data.py       | 21 ++++++++++++-------
 flexiznam/schema/microscopy_data.py |  6 ------
 flexiznam/schema/onix_data.py       | 31 ++++++++++++++++++++++++++++-
 flexiznam/schema/scanimage_data.py  | 12 ++++++-----
 6 files changed, 68 insertions(+), 31 deletions(-)

diff --git a/flexiznam/schema/camera_data.py b/flexiznam/schema/camera_data.py
index 30d7631..98a2bc7 100644
--- a/flexiznam/schema/camera_data.py
+++ b/flexiznam/schema/camera_data.py
@@ -219,12 +219,14 @@ def video_file(self):
     def video_file(self, value):
         self.extra_attributes["video_file"] = str(value)
 
-    def is_valid(self):
+    def is_valid(self, return_reason=False):
         """Check that video, metadata and timestamps files exist"""
-        if not (pathlib.Path(self.path) / self.timestamp_file).exists():
-            return False
-        if not (pathlib.Path(self.path) / self.metadata_file).exists():
-            return False
-        if not (pathlib.Path(self.path) / self.video_file).exists():
-            return False
-        return True
+        for attr in ["video_file", "timestamp_file", "metadata_file"]:
+            if attr not in self.extra_attributes:
+                msg = f"Missing attribute {attr}"
+                return msg if return_reason else False
+            fname = getattr(self, attr)
+            if not (self.path_full / fname).exists():
+                msg = f"Unvalid {attr}. {self.path_full / fname} does not exist"
+                return msg if return_reason else False
+        return "" if return_reason else True
diff --git a/flexiznam/schema/datasets.py b/flexiznam/schema/datasets.py
index 1121c1c..c6c91d7 100644
--- a/flexiznam/schema/datasets.py
+++ b/flexiznam/schema/datasets.py
@@ -325,13 +325,16 @@ def __init__(
         elif project_id is not None:
             self.project_id = project_id
 
-    def is_valid(self):
-        """
-        Dummy method definition. Should be reimplemented in children classes
+    def is_valid(self, return_reason=False):
+        """Check if the file path is valid for this dataset
 
+        Should be reimplemented in children classes.
         Should return True if the dataset is found a valid, false otherwise
         """
-        raise NotImplementedError("`is_valid` is not defined for generic datasets")
+        if not self.path_full.exists():
+            msg = f"Path {self.path_full} does not exist"
+            return msg if return_reason else False
+        return "" if return_reason else True
 
     def associated_files(self, folder=None):
         """Give a list of all files associated with this dataset
diff --git a/flexiznam/schema/harp_data.py b/flexiznam/schema/harp_data.py
index 3613cd1..e105f00 100644
--- a/flexiznam/schema/harp_data.py
+++ b/flexiznam/schema/harp_data.py
@@ -163,11 +163,18 @@ def csv_files(self):
     def csv_files(self, value):
         self.extra_attributes["csv_files"] = str(value)
 
-    def is_valid(self):
-        """Check that video, metadata and timestamps files exist"""
-        if not (pathlib.Path(self.path) / self.binary_file).exists():
-            return False
+    def is_valid(self, return_reason=False):
+        """Check that video, metadata and timestamps files exist
+
+        Args:
+            return_reason (bool): if True, return a string with the reason why the
+                                  dataset is not valid
+        Returns:"""
+        if not (self.path_full / self.binary_file).exists():
+            msg = f"Missing file {self.binary_file}"
+            return msg if return_reason else False
         for _, file_path in self.csv_files.items():
-            if not (pathlib.Path(self.path) / file_path).exists():
-                return False
-        return True
+            if not (self.path_full / file_path).exists():
+                msg = f"Missing file {file_path}"
+                return msg if return_reason else False
+        return "" if return_reason else True
diff --git a/flexiznam/schema/microscopy_data.py b/flexiznam/schema/microscopy_data.py
index dc0c44b..eda32cb 100644
--- a/flexiznam/schema/microscopy_data.py
+++ b/flexiznam/schema/microscopy_data.py
@@ -139,9 +139,3 @@ def __init__(
             id=id,
             flexilims_session=flexilims_session,
         )
-
-    def is_valid(self):
-        """Check that the file exist"""
-        if not (pathlib.Path(self.path)).exists():
-            return False
-        return True
diff --git a/flexiznam/schema/onix_data.py b/flexiznam/schema/onix_data.py
index 57c59b6..cc09ac0 100644
--- a/flexiznam/schema/onix_data.py
+++ b/flexiznam/schema/onix_data.py
@@ -94,7 +94,9 @@ def from_folder(
             onix_name = "onix_data_%s" % ts.strftime("%Y-%m-%d_%H_%M_%S")
             extra_attributes = dict()
             for device, dev_df in df.groupby("device_name"):
-                extra_attributes[device] = {s.subname: s.file for s in dev_df.itertuples()}
+                extra_attributes[device] = {
+                    s.subname: s.file for s in dev_df.itertuples()
+                }
             output[onix_name] = OnixData(
                 path=folder,
                 genealogy=folder_genealogy + (onix_name,),
@@ -150,3 +152,30 @@ def __init__(
             id=id,
             flexilims_session=flexilims_session,
         )
+
+    def is_valid(self, return_reason=False):
+        """Check that the onix dataset is valid
+
+        Args:
+            return_reason (bool): if True, return a string with the reason why the
+                dataset is not valid. If False, return True or False
+
+        Returns:
+            bool or str: True if valid, False if not. If return_reason is True, return
+                a string with the reason why the dataset is not valid."""
+
+        ndevices = 0
+        for device_name in OnixData.DEVICE_NAMES:
+            if device_name not in self.extra_attributes:
+                continue
+            ndevices += 1
+            dev_dict = self.extra_attributes[device_name]
+            for v in dev_dict.values():
+                p = self.path_full / v
+                if not p.exists():
+                    msg = f"File {p} does not exist"
+                    return msg if return_reason else False
+        if ndevices == 0:
+            msg = "No devices found"
+            return msg if return_reason else False
+        return "" if return_reason else True
diff --git a/flexiznam/schema/scanimage_data.py b/flexiznam/schema/scanimage_data.py
index 681c48f..d299ebe 100644
--- a/flexiznam/schema/scanimage_data.py
+++ b/flexiznam/schema/scanimage_data.py
@@ -234,7 +234,7 @@ def tif_files(self, value):
             )
         self.extra_attributes["tif_files"] = value
 
-    def is_valid(self, tif_files=None):
+    def is_valid(self, return_reason=False, tif_files=None):
         """Check that associated files exist"""
         if tif_files is None:
             tif_files = self.tif_files
@@ -244,11 +244,13 @@ def is_valid(self, tif_files=None):
             f for f in os.listdir(self.path) if f.endswith(("tif", ".tiff"))
         }
         if tif_files - existing_file:
-            return False
+            msg = "Some tif files do not exist: %s" % (tif_files - existing_file)
+            return msg if return_reason else False
         for _, file_path in self.csv_files.items():
-            if not (pathlib.Path(self.path) / file_path).exists():
-                return False
-        return True
+            if not (self.path_full / file_path).exists():
+                msg = "Csv file does not exist: %s" % file_path
+                return msg if return_reason else False
+        return "" if return_reason else True
 
     def __len__(self):
         """Number of tif files in the dataset"""

From c58cb237c1ac0b9a4383fbc9836a4ba23622e6f6 Mon Sep 17 00:00:00 2001
From: Antonin Blot <antonin.blot@crick.ac.uk>
Date: Fri, 2 Jun 2023 11:20:21 +0100
Subject: [PATCH 16/73] [minor] Move upload_yaml up in the file

To keep private functions together at the bottom
---
 flexiznam/camp/sync_data.py | 121 ++++++++++++++++++------------------
 1 file changed, 62 insertions(+), 59 deletions(-)

diff --git a/flexiznam/camp/sync_data.py b/flexiznam/camp/sync_data.py
index 86d2b93..349ee61 100644
--- a/flexiznam/camp/sync_data.py
+++ b/flexiznam/camp/sync_data.py
@@ -45,6 +45,8 @@ def create_yaml_dict(
     assert origin is not None, f"Origin {origin_name} not found in project {project}"
     assert "genealogy" in origin, f"Origin {origin_name} has no genealogy"
     genealogy = origin["genealogy"]
+    root_folder = Path(root_folder)
+    assert root_folder.is_dir(), f"Folder {root_folder} does not exist"
 
     data = _create_yaml_dict(
         level_folder=root_folder,
@@ -53,10 +55,69 @@ def create_yaml_dict(
         format_yaml=format_yaml,
         parent_dict=dict(),
     )
-    out = dict(root_folder=root_folder, origin_name=origin_name, children=data)
+    out = dict(root_folder=root_folder.parent, origin_name=origin_name, children=data)
     return out
 
 
+
+def upload_yaml(
+    source_yaml,
+    raw_data_folder=None,
+    verbose=False,
+    log_func=print,
+    flexilims_session=None,
+    conflicts="abort",
+):
+    """Upload data from one yaml to flexilims
+
+    Args:
+        source_yaml (dict or str): path to clean yaml or yaml dict
+        raw_data_folder (str): path to the folder containing the data. Default to
+            data_root['raw']
+        verbose (bool): print progress information
+        log_func: function to deal with warnings and messages
+        flexilims_session (Flexilims): session to avoid recreating a token
+        conflicts (str): `abort` to crash if there is already a session or recording
+                         existing on flexilims, `skip` to ignore and proceed. Samples
+                         are always updated with `skip` and datasets always have
+                         mode=`safe`
+
+    Returns:
+        list of names of entities created/updated
+
+    """
+    if isinstance(source_yaml, str):
+        source_yaml = Path(source_yaml)
+        with open(source_yaml, "r") as f:
+            yaml_data = yaml.safe_load(f)
+    else:
+        assert isinstance(source_yaml, dict), "source_yaml must be a dict or a path"
+        yaml_data = source_yaml
+
+    # first find the origin
+
+    if flexilims_session is None:
+        flexilims_session = flz.get_flexilims_session(project_id=yaml_data["project"])
+
+    origin_name = yaml_data["origin_name"]
+    with warnings.catch_warnings():
+        warnings.simplefilter("ignore")
+        origin = flz.get_entity(name=origin_name, flexilims_session=flexilims_session)
+    assert origin is not None, f"`{origin_name}` not found on flexilims"
+    if verbose:
+        print(f"Found origin `{origin_name}` with id `{origin.id}`")
+    # then upload the data recursively
+    _upload_yaml_dict(
+        yaml_data["children"],
+        origin=origin,
+        raw_data_folder=raw_data_folder,
+        log_func=log_func,
+        flexilims_session=flexilims_session,
+        conflicts=conflicts,
+        verbose=verbose,
+    )
+
+
 def _create_yaml_dict(
     level_folder,
     project,
@@ -135,64 +196,6 @@ def _create_yaml_dict(
     return parent_dict
 
 
-def upload_yaml(
-    source_yaml,
-    raw_data_folder=None,
-    verbose=False,
-    log_func=print,
-    flexilims_session=None,
-    conflicts="abort",
-):
-    """Upload data from one yaml to flexilims
-
-    Args:
-        source_yaml (dict or str): path to clean yaml or yaml dict
-        raw_data_folder (str): path to the folder containing the data. Default to
-            data_root['raw']
-        verbose (bool): print progress information
-        log_func: function to deal with warnings and messages
-        flexilims_session (Flexilims): session to avoid recreating a token
-        conflicts (str): `abort` to crash if there is already a session or recording
-                         existing on flexilims, `skip` to ignore and proceed. Samples
-                         are always updated with `skip` and datasets always have
-                         mode=`safe`
-
-    Returns:
-        list of names of entities created/updated
-
-    """
-    if isinstance(source_yaml, str):
-        source_yaml = Path(source_yaml)
-        with open(source_yaml, "r") as f:
-            yaml_data = yaml.safe_load(f)
-    else:
-        assert isinstance(source_yaml, dict), "source_yaml must be a dict or a path"
-        yaml_data = source_yaml
-
-    # first find the origin
-
-    if flexilims_session is None:
-        flexilims_session = flz.get_flexilims_session(project_id=yaml_data["project"])
-
-    origin_name = yaml_data["origin_name"]
-    with warnings.catch_warnings():
-        warnings.simplefilter("ignore")
-        origin = flz.get_entity(name=origin_name, flexilims_session=flexilims_session)
-    assert origin is not None, f"`{origin_name}` not found on flexilims"
-    if verbose:
-        print(f"Found origin `{origin_name}` with id `{origin.id}`")
-    # then upload the data recursively
-    _upload_yaml_dict(
-        yaml_data["children"],
-        origin=origin,
-        raw_data_folder=raw_data_folder,
-        log_func=log_func,
-        flexilims_session=flexilims_session,
-        conflicts=conflicts,
-        verbose=verbose,
-    )
-
-
 def _upload_yaml_dict(
     yaml_dict, origin, raw_data_folder, log_func, flexilims_session, conflicts, verbose
 ):

From f3efb75751fd54d63087097fc4fe330d92381f93 Mon Sep 17 00:00:00 2001
From: Antonin Blot <antonin.blot@crick.ac.uk>
Date: Fri, 2 Jun 2023 11:20:58 +0100
Subject: [PATCH 17/73] [feature] add check_yaml_validity

Recursively validates datasets
---
 flexiznam/camp/sync_data.py | 92 +++++++++++++++++++++++++++++++++----
 1 file changed, 84 insertions(+), 8 deletions(-)

diff --git a/flexiznam/camp/sync_data.py b/flexiznam/camp/sync_data.py
index 349ee61..6b106cd 100644
--- a/flexiznam/camp/sync_data.py
+++ b/flexiznam/camp/sync_data.py
@@ -5,6 +5,7 @@
 import re
 import copy
 import warnings
+import pandas as pd
 import yaml
 from yaml.parser import ParserError
 
@@ -59,6 +60,46 @@ def create_yaml_dict(
     return out
 
 
+def check_yaml_validity(yaml_data, root_folder=None, origin_name=None, project=None):
+    if isinstance(yaml_data, str) or isinstance(yaml_data, Path):
+        with open(yaml_data, "r") as f:
+            yaml_data = yaml.safe_load(f)
+    if root_folder is not None:
+        assert yaml_data["root_folder"] == str(
+            root_folder
+        ), f"root_folder is {yaml_data['root_folder']}. Expected {root_folder}"
+    else:
+        root_folder = yaml_data["root_folder"]
+
+    if project is not None:
+        assert (
+            yaml_data["project"] == project
+        ), f"project is {yaml_data['project']}. Expected {project}"
+    else:
+        project = yaml_data["project"]
+
+    if origin_name is not None:
+        assert (
+            yaml_data["origin_name"] == origin_name
+        ), f"origin_name is {yaml_data['origin_name']}. Expected {origin_name}"
+    else:
+        origin_name = yaml_data["origin_name"]
+
+    flm_sess = flz.get_flexilims_session(project_id=project)
+    with warnings.catch_warnings():
+        warnings.simplefilter("ignore")
+        origin = flz.get_entity(name=origin_name, flexilims_session=flm_sess)
+    assert hasattr(origin, "genealogy"), f"Origin {origin_name} has no genealogy"
+
+    _check_recursively(
+        yaml_data["children"],
+        origin_genealogy=origin["genealogy"],
+        root_folder=root_folder,
+        project=project,
+        genealogy=[],
+    )
+    return yaml_data
+
 
 def upload_yaml(
     source_yaml,
@@ -272,13 +313,48 @@ def _upload_yaml_dict(
         )
 
 
-def check_yaml_validity(yaml, root_folder, origin_name):
-    if isinstance(yaml, str):
-        with open(yaml, "r") as f:
-            yaml = yaml.safe_load(f)
-    assert yaml["root_folder"] == root_folder, f"root_folder should be {root_folder}"
-    _check_recursively(yaml["children"], root_folder, origin_name)
+def _check_recursively(
+    yaml_data, origin_genealogy, root_folder, project, genealogy, fixerrors=False
+):
+    root_folder = Path(root_folder)
+
+    for child, child_dict in yaml_data.items():
+        fname = root_folder / Path(*genealogy) / child
+        child_genealogy = genealogy + [child]
+
+        if child_dict["type"] != "dataset":
+            if not fname.is_dir():
+                child_dict["PATH_ERROR"] = f"XXERRORXX folder {fname} does not exist"
+        else:
+            data_series = pd.Series(child_dict)
+            for k, v in data_series.pop("extra_attributes").items():
+                data_series[k] = v
+            data_series.id = None
+            data_series.name = "_".join(origin_genealogy + child_genealogy)
+            ds = flz.Dataset.from_flexilims(data_series=data_series)
+            msg = ds.is_valid(return_reason=True)
+            if msg:
+                child_dict["VALIDATION_ERROR"] = f"XXERRORXX {msg}"
+
+        if child_dict["genealogy"] != origin_genealogy + child_genealogy:
+            if fixerrors:
+                print(f"Fixing genealogy for {child}")
+                child_dict["genealogy"] = origin_genealogy + child_genealogy
+            else:
+                child_dict["GENEALOGY_ERROR"] = f"XXERRORXX genealogy is not correct"
+        if "children" in child_dict:
+            _check_recursively(
+                child_dict["children"],
+                origin_genealogy,
+                root_folder,
+                project,
+                genealogy=genealogy + [child],
+            )
 
 
-def _check_recursively(yaml, root_folder):
-    raise NotImplementedError
+if __name__ == "__main__":
+    rel = "blota_onix_pilote/BRAC7448.2d/"
+    root_folder = Path(flz.PARAMETERS["data_root"]["raw"]) / rel
+    yaml_file = Path(flz.PARAMETERS["data_root"]["processed"]) / rel / "S20230421.yml"
+    origin_name = "BRAC7448.2d"
+    check_yaml_validity(yaml_file, root_folder, origin_name)

From 79909a3935553a01cc2801066b311f8d86d27056 Mon Sep 17 00:00:00 2001
From: Antonin Blot <antonin.blot@crick.ac.uk>
Date: Fri, 2 Jun 2023 11:21:26 +0100
Subject: [PATCH 18/73] [gui] gui checks for errors before uploading

It is slow. Should add a progress indication
---
 flexiznam/gui/flexigui.py | 24 ++++++++++++++++++------
 1 file changed, 18 insertions(+), 6 deletions(-)

diff --git a/flexiznam/gui/flexigui.py b/flexiznam/gui/flexigui.py
index a3f4aa1..0190e9b 100644
--- a/flexiznam/gui/flexigui.py
+++ b/flexiznam/gui/flexigui.py
@@ -26,7 +26,7 @@ def __init__(self):
         self._create_frames()
         self._setup_widgets()
         self._entity_by_itemid = {}
-
+        self.contains_errors = False
         self.data = {}
 
     def _setup_widgets(self):
@@ -112,6 +112,7 @@ def parse_folder(self):
             origin_name=self.origin_name.get(),
             format_yaml=True,
         )
+        data = flz.camp.sync_data.check_yaml_dict(data)
         self.data = data
         self.update_data()
 
@@ -211,6 +212,8 @@ def update_data(self, name_to_select=None):
             self.origin_name.set(self.data["origin_name"])
         if "root_folder" in self.data:
             self.root_folder.set(self.data["root_folder"])
+
+        self.contains_errors = False
         self._insert_yaml_data(self.data["children"], name_to_select=name_to_select)
 
     def _insert_yaml_data(self, data, parent="", name_to_select=None):
@@ -233,6 +236,7 @@ def _insert_yaml_data(self, data, parent="", name_to_select=None):
                     if isinstance(v, str)
                 ]
             ):
+                self.contains_errors = True
                 self.treeview.item(item, tags=("error",))
 
             self._entity_by_itemid[item] = (child, child_data)
@@ -265,15 +269,23 @@ def upload(self):
         if not self._check_options_are_set():
             return
 
-        data = dict(self.data)
-        if not data:
+        if not self.data:
             tk.messagebox.showerror("Error", "No data loaded")
             return
+
+        self.data = flz.camp.sync_data.check_yaml_validity(self.data)
+
+        if self.contains_errors:
+            tk.messagebox.showerror(
+                "Error",
+                "There are still errors. Please fix them before uploading",
+            )
+            return
+
+        data = dict(self.data)
         data["project"] = self.project.get()
         data["root_folder"] = self.root_folder.get()
-        if data["project"].startswith("XXERRORXX"):
-            print("Project name not set")
-            return
+
         flz.camp.sync_data.upload_yaml(
             source_yaml=data,
             raw_data_folder=data["root_folder"],

From 47060b473dc6080f024fe65942aa74f3219be42b Mon Sep 17 00:00:00 2001
From: Antonin Blot <antonin.blot@crick.ac.uk>
Date: Fri, 2 Jun 2023 11:30:29 +0100
Subject: [PATCH 19/73] [gui] add status bar and refactor

Put gui creation function together
---
 flexiznam/gui/flexigui.py | 118 ++++++++++++++++++++++----------------
 1 file changed, 70 insertions(+), 48 deletions(-)

diff --git a/flexiznam/gui/flexigui.py b/flexiznam/gui/flexigui.py
index 0190e9b..103a3f4 100644
--- a/flexiznam/gui/flexigui.py
+++ b/flexiznam/gui/flexigui.py
@@ -29,37 +29,47 @@ def __init__(self):
         self.contains_errors = False
         self.data = {}
 
+    ############# GUI setup methods #############
+    # These methods are used to create the GUI elements
+
     def _setup_widgets(self):
         self._create_frames()
         self._create_buttons()
         self._create_treeview()
         self._create_textview()
+        self._create_statusbar()
 
     def _create_frames(self):
-        self.frames["t"] = tk.Frame(self)
-        self.frames["t"].grid(
+        self.frames["T"] = tk.Frame(self)
+        self.frames["T"].grid(
             row=0, column=0, padx=10, pady=5, columnspan=2, sticky="nwe"
         )
-        self.frames["t"].rowconfigure(0, weight=1)
-        self.frames["t"].rowconfigure(1, weight=1)
+        self.frames["T"].rowconfigure(0, weight=1)
+        self.frames["T"].rowconfigure(1, weight=1)
         for i in range(10):
-            self.frames["t"].columnconfigure(i, weight=1)
-        self.frames["t"].columnconfigure(3, weight=10)
-        self.frames["bl"] = tk.Frame(self)
-        self.frames["bl"].grid(row=1, column=0, padx=10, pady=5, sticky="nsew")
-        self.frames["bl"].rowconfigure(0, weight=1)
-        self.frames["bl"].columnconfigure(0, weight=1)
-        self.frames["br"] = tk.Frame(self)
-        self.frames["br"].grid(row=1, column=1, padx=10, pady=5, sticky="nsew")
-        self.frames["br"].rowconfigure(0, weight=1)
-        self.frames["br"].rowconfigure(1, weight=30)
-        self.frames["br"].rowconfigure(2, weight=1)
-        self.frames["br"].columnconfigure(0, weight=1)
+            self.frames["T"].columnconfigure(i, weight=1)
+        self.frames["T"].columnconfigure(3, weight=10)
+        self.frames["L"] = tk.Frame(self)
+        self.frames["L"].grid(row=1, column=0, padx=10, pady=5, sticky="nsew")
+        self.frames["L"].rowconfigure(0, weight=1)
+        self.frames["L"].columnconfigure(0, weight=1)
+        self.frames["R"] = tk.Frame(self)
+        self.frames["R"].grid(row=1, column=1, padx=10, pady=5, sticky="nsew")
+        self.frames["R"].rowconfigure(0, weight=1)
+        self.frames["R"].rowconfigure(1, weight=30)
+        self.frames["R"].rowconfigure(2, weight=1)
+        self.frames["R"].columnconfigure(0, weight=1)
+        self.frames["B"] = tk.Frame(self)
+        self.frames["B"].grid(
+            row=2, column=0, columnspan=2, padx=10, pady=5, sticky="sew"
+        )
+        self.frames["B"].rowconfigure(0, weight=1)
+        self.frames["B"].columnconfigure(0, weight=10)
 
     def _create_treeview(self):
         # Create the Treeview
         self.treeview = CheckboxTreeview(
-            self.frames["bl"],
+            self.frames["L"],
             columns=("datatype",),
             selectmode="browse",
         )
@@ -73,51 +83,25 @@ def _create_treeview(self):
 
     def _create_textview(self):
         # Create the Text widget
-        tk.Label(self.frames["br"], text="Selected item:").grid(
+        tk.Label(self.frames["R"], text="Selected item:").grid(
             row=0,
             column=0,
             sticky="nw",
         )
         self.selected_item = tk.StringVar()
         self.selected_item.set("None")
-        l = tk.Label(self.frames["br"], textvariable=self.selected_item)
+        l = tk.Label(self.frames["R"], textvariable=self.selected_item)
         l.grid(row=0, column=1, sticky="new")
-        self.textview = tk.Text(self.frames["br"], width=40, height=10, wrap="none")
+        self.textview = tk.Text(self.frames["R"], width=40, height=10, wrap="none")
         self.textview.grid(row=1, column=0, sticky="nsew", columnspan=2)
         self.textview.bind("<<Modified>>", self.on_textview_change)
         self.update_item_btn = tk.Button(
-            self.frames["br"], text="Update item", command=self.update_item
+            self.frames["R"], text="Update item", command=self.update_item
         )
         self.update_item_btn.grid(row=2, column=1, sticky="nsw")
 
-    def _check_options_are_set(self, options=("project", "origin_name")):
-        init_values = dict(project="SELECT", origin_name="ENTER")
-        for option in options:
-            value = getattr(self, option).get()
-            if value.startswith(init_values[option]):
-                tk.messagebox.showerror("Error", f"Error: enter {option} first!")
-                return False
-        return True
-
-    def parse_folder(self):
-        if not self._check_options_are_set():
-            return
-        folder = tk.filedialog.askdirectory(
-            initialdir=self.root_folder.get(), title="Select directory to parse"
-        )
-        self.root_folder.set(folder)
-        data = flz.camp.sync_data.create_yaml_dict(
-            root_folder=folder,
-            project=self.project.get(),
-            origin_name=self.origin_name.get(),
-            format_yaml=True,
-        )
-        data = flz.camp.sync_data.check_yaml_dict(data)
-        self.data = data
-        self.update_data()
-
     def _create_buttons(self):
-        topf = self.frames["t"]
+        topf = self.frames["T"]
         self.parse_btn = tk.Button(topf, text="Parse", command=self.parse_folder)
         self.parse_btn.grid(row=0, column=0, sticky="w")
         self.load_btn = tk.Button(topf, text="Load", command=self.load_yaml)
@@ -163,6 +147,44 @@ def _create_buttons(self):
         self.chg_dir_btn = tk.Button(topf, text="...", command=self.chg_root_folder)
         self.chg_dir_btn.grid(row=1, column=10)
 
+    def _create_statusbar(self):
+        self.statusbar = tk.Label(
+            self.frames["B"], text="Ready", bd=1, relief=tk.SUNKEN
+        )
+        self.statusbar.grid(row=0, column=0, sticky="sw")
+
+    ############# GUI update methods #############
+    # These methods are used to actually do stuff with the GUI elements
+    def report(self, message):
+        self.statusbar["text"] = message
+        print(message)
+
+    def _check_options_are_set(self, options=("project", "origin_name")):
+        init_values = dict(project="SELECT", origin_name="ENTER")
+        for option in options:
+            value = getattr(self, option).get()
+            if value.startswith(init_values[option]):
+                tk.messagebox.showerror("Error", f"Error: enter {option} first!")
+                return False
+        return True
+
+    def parse_folder(self):
+        if not self._check_options_are_set():
+            return
+        folder = tk.filedialog.askdirectory(
+            initialdir=self.root_folder.get(), title="Select directory to parse"
+        )
+        self.root_folder.set(folder)
+        data = flz.camp.sync_data.create_yaml_dict(
+            root_folder=folder,
+            project=self.project.get(),
+            origin_name=self.origin_name.get(),
+            format_yaml=True,
+        )
+        data = flz.camp.sync_data.check_yaml_dict(data)
+        self.data = data
+        self.update_data()
+
     def chg_root_folder(self):
         self.root_folder.set(
             tk.filedialog.askdirectory(

From acdb3201fffbd9d0677e85206e87c59eb2c645ec Mon Sep 17 00:00:00 2001
From: Antonin Blot <antonin.blot@crick.ac.uk>
Date: Fri, 2 Jun 2023 12:01:49 +0100
Subject: [PATCH 20/73] [gui] update status bar to indicate progress

---
 flexiznam/gui/flexigui.py | 37 +++++++++++++++++++++++++++++--------
 1 file changed, 29 insertions(+), 8 deletions(-)

diff --git a/flexiznam/gui/flexigui.py b/flexiznam/gui/flexigui.py
index 103a3f4..01bee85 100644
--- a/flexiznam/gui/flexigui.py
+++ b/flexiznam/gui/flexigui.py
@@ -148,24 +148,29 @@ def _create_buttons(self):
         self.chg_dir_btn.grid(row=1, column=10)
 
     def _create_statusbar(self):
+        self.sb_msg = tk.StringVar()
         self.statusbar = tk.Label(
-            self.frames["B"], text="Ready", bd=1, relief=tk.SUNKEN
+            self.frames["B"], textvariable=self.sb_msg, bd=1, relief=tk.SUNKEN
         )
         self.statusbar.grid(row=0, column=0, sticky="sw")
+        self.sb_msg.set("Ready")
 
     ############# GUI update methods #############
     # These methods are used to actually do stuff with the GUI elements
     def report(self, message):
-        self.statusbar["text"] = message
+        self.sb_msg.set(message)
         print(message)
+        self.update()
 
     def _check_options_are_set(self, options=("project", "origin_name")):
+        self.report("Checking options")
         init_values = dict(project="SELECT", origin_name="ENTER")
         for option in options:
             value = getattr(self, option).get()
             if value.startswith(init_values[option]):
                 tk.messagebox.showerror("Error", f"Error: enter {option} first!")
                 return False
+        self.report("Options are set")
         return True
 
     def parse_folder(self):
@@ -174,6 +179,7 @@ def parse_folder(self):
         folder = tk.filedialog.askdirectory(
             initialdir=self.root_folder.get(), title="Select directory to parse"
         )
+        self.report(f"Parsing folder {folder}...")
         self.root_folder.set(folder)
         data = flz.camp.sync_data.create_yaml_dict(
             root_folder=folder,
@@ -181,11 +187,14 @@ def parse_folder(self):
             origin_name=self.origin_name.get(),
             format_yaml=True,
         )
+        self.report("Parsing done. Validating data...")
         data = flz.camp.sync_data.check_yaml_dict(data)
         self.data = data
         self.update_data()
+        self.report("Done")
 
     def chg_root_folder(self):
+        self.report("Changing root folder")
         self.root_folder.set(
             tk.filedialog.askdirectory(
                 initialdir=self.root_folder.get(), title="Select root directory"
@@ -195,17 +204,18 @@ def chg_root_folder(self):
     def on_treeview_select(self, event):
         item = self.treeview.focus()
         name, data = self._entity_by_itemid[item]
+        self.report(f"Selected item: {name}")
         self.selected_item.set(name)
         display = {k: v for k, v in data.items() if k not in self.FLEXILIMS_ONLY_FIELDS}
         self.textview.delete(1.0, tk.END)
         self.textview.insert(tk.END, yaml.dump(display))
 
     def on_textview_change(self, event):
-        print('Textview changed: "{}"'.format(event))
+        return
 
     def load_yaml(self):
         """Load a YAML file and display it in the treeview"""
-        print("Select YAML file to load")
+        self.report("Select YAML file to load")
         filetypes = (("Yaml files", "*.yml *.yaml"), ("All files", "*.*"))
 
         filename = tk.filedialog.askopenfilename(
@@ -213,10 +223,11 @@ def load_yaml(self):
         )
         if not filename:
             return
+        self.report(f"Loading YAML file {filename}...")
         with open(filename, "r") as f:
             self.data = yaml.safe_load(f)
-        print('Loaded YAML file "{}"'.format(filename))
         self.update_data()
+        self.report("Done")
 
     def update_data(self, name_to_select=None):
         """Update GUI data from self.data
@@ -224,6 +235,7 @@ def update_data(self, name_to_select=None):
         Args:
             name_to_select (str, optional): Name of item to select in treeview.
                 Defaults to None."""
+        self.report("Updating GUI")
         self.textview.delete("1.0", tk.END)
         self.selected_item.set("None")
         self.treeview.delete(*self.treeview.get_children())
@@ -259,6 +271,7 @@ def _insert_yaml_data(self, data, parent="", name_to_select=None):
                 ]
             ):
                 self.contains_errors = True
+                self.report(f"ERROR: {child} contains errors")
                 self.treeview.item(item, tags=("error",))
 
             self._entity_by_itemid[item] = (child, child_data)
@@ -273,17 +286,21 @@ def _insert_yaml_data(self, data, parent="", name_to_select=None):
 
     def write_yaml(self):
         """Write the current data to a YAML file"""
+        self.report("Select YAML file to write")
         target = tk.filedialog.asksaveasfilename(
             initialdir=self.root_folder.get(),
             title="Select YAML file to write",
             filetypes=(("Yaml files", "*.yml *.yaml"), ("All files", "*.*")),
         )
+        if not target:
+            self.report("No file selected. Cancel")
+            return
         data = dict(self.data)
         data["project"] = self.project.get()
         data["root_folder"] = self.root_folder.get()
         with open(target, "w") as f:
             yaml.dump(data, f)
-        print('Wrote YAML file "{}"'.format(target))
+        self.report('Wrote YAML file "{}"'.format(target))
 
     def upload(self):
         """Upload data to flexilims"""
@@ -295,6 +312,7 @@ def upload(self):
             tk.messagebox.showerror("Error", "No data loaded")
             return
 
+        self.report("Validating data...")
         self.data = flz.camp.sync_data.check_yaml_validity(self.data)
 
         if self.contains_errors:
@@ -307,7 +325,7 @@ def upload(self):
         data = dict(self.data)
         data["project"] = self.project.get()
         data["root_folder"] = self.root_folder.get()
-
+        self.report("Validating data...")
         flz.camp.sync_data.upload_yaml(
             source_yaml=data,
             raw_data_folder=data["root_folder"],
@@ -316,15 +334,17 @@ def upload(self):
             flexilims_session=None,
             conflicts=self.conflicts.get(),
         )
-        print("Done")
+        self.report("Done")
 
     def update_item(self):
         """Update the selected item with the textview contents"""
+
         text = self.textview.get(1.0, tk.END)
         if not text.strip():
             return
         item = self.treeview.focus()
         name, original_data = self._entity_by_itemid[item]
+        self.report(f"Updating item {name}")
         assert name == self.selected_item.get(), "Selected item does not match"
         data = yaml.safe_load(text)
         for field in self.FLEXILIMS_ONLY_FIELDS:
@@ -344,6 +364,7 @@ def update_item(self):
             ref = ref["children"][parent]
         ref["children"][name] = data
         self.update_data(name_to_select=name)
+        self.report("Done")
 
 
 if __name__ == "__main__":

From 0f07d125df85ee4ceaaab2227fe39be8d1005356 Mon Sep 17 00:00:00 2001
From: Antonin Blot <antonin.blot@crick.ac.uk>
Date: Fri, 2 Jun 2023 16:12:16 +0100
Subject: [PATCH 21/73] [bugfix] typo

---
 flexiznam/gui/flexigui.py | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

diff --git a/flexiznam/gui/flexigui.py b/flexiznam/gui/flexigui.py
index 01bee85..91adf41 100644
--- a/flexiznam/gui/flexigui.py
+++ b/flexiznam/gui/flexigui.py
@@ -188,7 +188,7 @@ def parse_folder(self):
             format_yaml=True,
         )
         self.report("Parsing done. Validating data...")
-        data = flz.camp.sync_data.check_yaml_dict(data)
+        data = flz.camp.sync_data.check_yaml_validity(data)
         self.data = data
         self.update_data()
         self.report("Done")

From e31b8a78f3389e9dc6b3a8cc8bb8f40d525df5a2 Mon Sep 17 00:00:00 2001
From: Antonin Blot <antonin.blot@crick.ac.uk>
Date: Fri, 2 Jun 2023 16:15:16 +0100
Subject: [PATCH 22/73] [bugfix] add_dataset has no arg genealogy

---
 flexiznam/schema/datasets.py | 1 -
 1 file changed, 1 deletion(-)

diff --git a/flexiznam/schema/datasets.py b/flexiznam/schema/datasets.py
index c6c91d7..6216f6b 100644
--- a/flexiznam/schema/datasets.py
+++ b/flexiznam/schema/datasets.py
@@ -441,7 +441,6 @@ def update_flexilims(self, mode="safe"):
             dataset_type=self.dataset_type,
             created=self.created,
             path=str(PurePosixPath(self.path)),
-            genealogy=self.genealogy,
             is_raw="yes" if self.is_raw else "no",
             project_id=self.project_id,
             dataset_name=self.full_name,

From 16e31dcc624db9c38ad9151db395798b075d7187 Mon Sep 17 00:00:00 2001
From: Antonin Blot <antonin.blot@crick.ac.uk>
Date: Fri, 2 Jun 2023 17:28:53 +0100
Subject: [PATCH 23/73] [feature] add ds validation to seq and mic data

---
 flexiznam/schema/microscopy_data.py | 12 ++++++++++++
 flexiznam/schema/sequencing_data.py | 14 ++++++++++----
 2 files changed, 22 insertions(+), 4 deletions(-)

diff --git a/flexiznam/schema/microscopy_data.py b/flexiznam/schema/microscopy_data.py
index eda32cb..ab7d99c 100644
--- a/flexiznam/schema/microscopy_data.py
+++ b/flexiznam/schema/microscopy_data.py
@@ -139,3 +139,15 @@ def __init__(
             id=id,
             flexilims_session=flexilims_session,
         )
+
+    def is_valid(self, return_reason=False):
+        """Check that file exist
+
+        Args:
+            return_reason (bool): if True, return a string with the reason why the
+                                  dataset is not valid
+        Returns:"""
+        if not self.path_full.exists():
+            msg = f"{self.path_full} does not exist"
+            return msg if return_reason else False
+        return "" if return_reason else True
diff --git a/flexiznam/schema/sequencing_data.py b/flexiznam/schema/sequencing_data.py
index d226b68..6fad91d 100644
--- a/flexiznam/schema/sequencing_data.py
+++ b/flexiznam/schema/sequencing_data.py
@@ -126,8 +126,14 @@ def __init__(
             project_id=project_id,
         )
 
-    def is_valid(self):
-        """Check that the file exist"""
+    def is_valid(self, return_reason=False):
+        """Check that file exist
+
+        Args:
+            return_reason (bool): if True, return a string with the reason why the
+                                  dataset is not valid
+        Returns:"""
         if not self.path_full.exists():
-            return False
-        return True
+            msg = f"{self.path_full} does not exist"
+            return msg if return_reason else False
+        return "" if return_reason else True

From f04a54dbd6fe203650c636aa7f83a1bafe093f9e Mon Sep 17 00:00:00 2001
From: Antonin Blot <antonin.blot@crick.ac.uk>
Date: Fri, 2 Jun 2023 17:33:17 +0100
Subject: [PATCH 24/73] [sync] add project to new yaml syntax

---
 flexiznam/camp/sync_data.py | 7 ++++++-
 1 file changed, 6 insertions(+), 1 deletion(-)

diff --git a/flexiznam/camp/sync_data.py b/flexiznam/camp/sync_data.py
index 6b106cd..3ab80fd 100644
--- a/flexiznam/camp/sync_data.py
+++ b/flexiznam/camp/sync_data.py
@@ -56,7 +56,12 @@ def create_yaml_dict(
         format_yaml=format_yaml,
         parent_dict=dict(),
     )
-    out = dict(root_folder=root_folder.parent, origin_name=origin_name, children=data)
+    out = dict(
+        root_folder=root_folder.parent,
+        origin_name=origin_name,
+        children=data,
+        project=project,
+    )
     return out
 
 

From 78b9dee6524cd0ed5579669f693b0960fac32818 Mon Sep 17 00:00:00 2001
From: Antonin Blot <antonin.blot@crick.ac.uk>
Date: Sat, 26 Aug 2023 18:21:08 +0100
Subject: [PATCH 25/73] [bugfix] Parse correctly folder with . in name

---
 flexiznam/camp/sync_data.py | 4 ++--
 1 file changed, 2 insertions(+), 2 deletions(-)

diff --git a/flexiznam/camp/sync_data.py b/flexiznam/camp/sync_data.py
index 3ab80fd..ae06d87 100644
--- a/flexiznam/camp/sync_data.py
+++ b/flexiznam/camp/sync_data.py
@@ -192,7 +192,7 @@ def _create_yaml_dict(
     level_dict = dict()
     genealogy = list(genealogy)
 
-    level_name = level_folder.stem
+    level_name = level_folder.name
     m = re.fullmatch(r"R\d\d\d\d\d\d_?(.*)?", level_name)
     if m:
         level_dict["type"] = "recording"
@@ -238,7 +238,7 @@ def _create_yaml_dict(
                 parent_dict=children,
             )
     level_dict["children"] = children
-    parent_dict[level_folder.stem] = level_dict
+    parent_dict[level_name] = level_dict
     return parent_dict
 
 

From daaf215aa6b1ed54fd8ad5574407b32513dbb73e Mon Sep 17 00:00:00 2001
From: Antonin Blot <antonin.blot@crick.ac.uk>
Date: Tue, 29 Aug 2023 14:45:15 +0100
Subject: [PATCH 26/73] [feature] Add CLI entry point to GUI

---
 flexiznam/cli.py          | 14 +++++++++++++-
 flexiznam/gui/__init__.py |  1 +
 2 files changed, 14 insertions(+), 1 deletion(-)
 create mode 100644 flexiznam/gui/__init__.py

diff --git a/flexiznam/cli.py b/flexiznam/cli.py
index 6cc4dfc..b80e62e 100644
--- a/flexiznam/cli.py
+++ b/flexiznam/cli.py
@@ -6,6 +6,17 @@ def cli():
     pass
 
 
+@cli.command()
+@click.argument("root_folder", type=click.Path(exists=True), default=".")
+def gui(root_folder):
+    """Start the GUI"""
+    from flexiznam.gui import flexigui
+
+    app = flexigui.FlexiGui()
+    app.root_folder.set(root_folder)
+    app.mainloop()
+
+
 @cli.command()
 @click.option("-p", "--project_id", prompt="Enter the project ID", help="Project ID.")
 @click.option(
@@ -28,6 +39,7 @@ def cli():
     show_default=True,
 )
 def add_genealogy(project_id, name, recursive, verbose):
+    """Add genealogy to a flexilims entity"""
     from flexiznam import get_flexilims_session
 
     flm_sess = get_flexilims_session(project_id=project_id)
@@ -60,9 +72,9 @@ def add_mouse(
     flexilims_username=None,
     mcms_username=None,
 ):
+    """Add a single mouse to a project."""
     from flexiznam import main
 
-    """Add a single mouse to a project."""
     click.echo("Trying to add %s in %s" % (mouse_name, project_id))
     main.add_mouse(
         mouse_name=mouse_name,
diff --git a/flexiznam/gui/__init__.py b/flexiznam/gui/__init__.py
new file mode 100644
index 0000000..428404e
--- /dev/null
+++ b/flexiznam/gui/__init__.py
@@ -0,0 +1 @@
+from . import flexigui

From 3467daf10bab64181e5093e0304ba209b726a85d Mon Sep 17 00:00:00 2001
From: Antonin Blot <antonin.blot@crick.ac.uk>
Date: Tue, 29 Aug 2023 14:45:15 +0100
Subject: [PATCH 27/73] [feature] Add CLI entry point to GUI

---
 CHANGELOG.md              |  1 +
 flexiznam/cli.py          | 14 +++++++++++++-
 flexiznam/gui/__init__.py |  1 +
 3 files changed, 15 insertions(+), 1 deletion(-)
 create mode 100644 flexiznam/gui/__init__.py

diff --git a/CHANGELOG.md b/CHANGELOG.md
index f6e5ca7..c99bd2e 100644
--- a/CHANGELOG.md
+++ b/CHANGELOG.md
@@ -5,6 +5,7 @@
 ### Main changes
 
 - New `SequencingData` class to handle sequencing data
+- GUI can now be used to add data to flexilims with `flexiznam gui`
 
 ## v0.3.5
 
diff --git a/flexiznam/cli.py b/flexiznam/cli.py
index 6cc4dfc..b80e62e 100644
--- a/flexiznam/cli.py
+++ b/flexiznam/cli.py
@@ -6,6 +6,17 @@ def cli():
     pass
 
 
+@cli.command()
+@click.argument("root_folder", type=click.Path(exists=True), default=".")
+def gui(root_folder):
+    """Start the GUI"""
+    from flexiznam.gui import flexigui
+
+    app = flexigui.FlexiGui()
+    app.root_folder.set(root_folder)
+    app.mainloop()
+
+
 @cli.command()
 @click.option("-p", "--project_id", prompt="Enter the project ID", help="Project ID.")
 @click.option(
@@ -28,6 +39,7 @@ def cli():
     show_default=True,
 )
 def add_genealogy(project_id, name, recursive, verbose):
+    """Add genealogy to a flexilims entity"""
     from flexiznam import get_flexilims_session
 
     flm_sess = get_flexilims_session(project_id=project_id)
@@ -60,9 +72,9 @@ def add_mouse(
     flexilims_username=None,
     mcms_username=None,
 ):
+    """Add a single mouse to a project."""
     from flexiznam import main
 
-    """Add a single mouse to a project."""
     click.echo("Trying to add %s in %s" % (mouse_name, project_id))
     main.add_mouse(
         mouse_name=mouse_name,
diff --git a/flexiznam/gui/__init__.py b/flexiznam/gui/__init__.py
new file mode 100644
index 0000000..428404e
--- /dev/null
+++ b/flexiznam/gui/__init__.py
@@ -0,0 +1 @@
+from . import flexigui

From 78c341fe13d0e3cbc7a763b9bad5c8b8653ecdbc Mon Sep 17 00:00:00 2001
From: Antonin Blot <antonin.blot@crick.ac.uk>
Date: Tue, 29 Aug 2023 18:22:36 +0100
Subject: [PATCH 28/73] [bugfix] adapt gui to new from_dataseries

---
 flexiznam/camp/sync_data.py | 3 ++-
 1 file changed, 2 insertions(+), 1 deletion(-)

diff --git a/flexiznam/camp/sync_data.py b/flexiznam/camp/sync_data.py
index ae06d87..e1833ee 100644
--- a/flexiznam/camp/sync_data.py
+++ b/flexiznam/camp/sync_data.py
@@ -336,7 +336,8 @@ def _check_recursively(
                 data_series[k] = v
             data_series.id = None
             data_series.name = "_".join(origin_genealogy + child_genealogy)
-            ds = flz.Dataset.from_flexilims(data_series=data_series)
+            ds = flz.Dataset.from_dataseries(data_series)
+            ds.project = project
             msg = ds.is_valid(return_reason=True)
             if msg:
                 child_dict["VALIDATION_ERROR"] = f"XXERRORXX {msg}"

From c9850e546b47b7d7e7a6a3ade4d402efcc0ba6d7 Mon Sep 17 00:00:00 2001
From: Antonin Blot <antonin.blot@crick.ac.uk>
Date: Tue, 29 Aug 2023 18:30:51 +0100
Subject: [PATCH 29/73] [bugfix] Onix data from_folder timestamp issue

If the protocol starts at the end of a second, some timestamps are not exactly
the same.
These datasets can now be loaded if the timestamps differ by no more than 2s
---
 flexiznam/schema/onix_data.py | 55 ++++++++++++++++++-----------------
 1 file changed, 28 insertions(+), 27 deletions(-)

diff --git a/flexiznam/schema/onix_data.py b/flexiznam/schema/onix_data.py
index cc09ac0..1d34ec1 100644
--- a/flexiznam/schema/onix_data.py
+++ b/flexiznam/schema/onix_data.py
@@ -79,33 +79,34 @@ def from_folder(
 
         data = pd.DataFrame(data)
         output = dict()
-        for ts, df in data.groupby("timestamp"):
-            if (
-                enforce_validity
-                and ("rhd2164" not in df.device_name.values)
-                or ("breakout" not in df.device_name.values)
-            ):
-                if verbose:
-                    print(
-                        "Skipping partial onix dataset %s"
-                        % ts.strftime("%Y-%m-%d_%H_%M_%S")
-                    )
-                continue
-            onix_name = "onix_data_%s" % ts.strftime("%Y-%m-%d_%H_%M_%S")
-            extra_attributes = dict()
-            for device, dev_df in df.groupby("device_name"):
-                extra_attributes[device] = {
-                    s.subname: s.file for s in dev_df.itertuples()
-                }
-            output[onix_name] = OnixData(
-                path=folder,
-                genealogy=folder_genealogy + (onix_name,),
-                extra_attributes=extra_attributes,
-                created=ts.strftime("%Y-%m-%d " "%H:%M:%S"),
-                flexilims_session=flexilims_session,
-                project=project,
-                is_raw=is_raw,
-            )
+        if max(data.timestamp - data.timestamp.min()).total_seconds() > 2:
+            raise IOError(f"Multiple timestamps found in folder {folder}")
+
+        ts = data.timestamp.min()
+        if (
+            enforce_validity
+            and ("rhd2164" not in data.device_name.values)
+            or ("breakout" not in data.device_name.values)
+        ):
+            if verbose:
+                print(
+                    "Skipping partial onix dataset %s"
+                    % ts.strftime("%Y-%m-%d_%H_%M_%S")
+                )
+            return
+        onix_name = "onix_data_%s" % ts.strftime("%Y-%m-%d_%H_%M_%S")
+        extra_attributes = dict()
+        for device, dev_df in data.groupby("device_name"):
+            extra_attributes[device] = {s.subname: s.file for s in dev_df.itertuples()}
+        output[onix_name] = OnixData(
+            path=folder,
+            genealogy=folder_genealogy + (onix_name,),
+            extra_attributes=extra_attributes,
+            created=ts.strftime("%Y-%m-%d " "%H:%M:%S"),
+            flexilims_session=flexilims_session,
+            project=project,
+            is_raw=is_raw,
+        )
         return output
 
     def __init__(

From 1708e2c85c7171b25954b83f1895cc625fd8e1f4 Mon Sep 17 00:00:00 2001
From: Antonin Blot <antonin.blot@crick.ac.uk>
Date: Wed, 30 Aug 2023 09:47:20 +0100
Subject: [PATCH 30/73] [feature] `check_flm_issues` can add missing paths

---
 CHANGELOG.md     | 6 ++++++
 flexiznam/cli.py | 8 +++++++-
 2 files changed, 13 insertions(+), 1 deletion(-)

diff --git a/CHANGELOG.md b/CHANGELOG.md
index 053d5f6..5dc0593 100644
--- a/CHANGELOG.md
+++ b/CHANGELOG.md
@@ -1,5 +1,11 @@
 # Change log
 
+## v0.4.0
+
+### Main changes
+
+- `check_flexilims_issues` can now add missing paths
+
 ## v0.3.8
 
 ### Main changes
diff --git a/flexiznam/cli.py b/flexiznam/cli.py
index 734cc3f..5562b46 100644
--- a/flexiznam/cli.py
+++ b/flexiznam/cli.py
@@ -316,7 +316,10 @@ def yaml_to_flexilims(source_yaml, raw_data_folder=None, conflicts=None):
 @click.option("-t", "--target_file", default=None, help="Path to write csv output.")
 @click.option("-r", "--root_name", default=None, help="Root entity to start the check.")
 @click.option("--flexilims_username", default=None, help="Your username on flexilims.")
-def check_flexilims_issues(project_id, target_file, root_name, flexilims_username):
+@click.option("--add-path/--no-add-path", default=False, help="Add missing paths.")
+def check_flexilims_issues(
+    project_id, target_file, root_name, flexilims_username, add_path
+):
     """Check that database is properly formatted
 
     This will check recursively all mice if `root_name` is not provided. Elements that
@@ -350,3 +353,6 @@ def check_flexilims_issues(project_id, target_file, root_name, flexilims_usernam
     else:
         df = pdf
     df.to_csv(target_file)
+    if add_path:
+        print("Adding missing paths")
+        utils.add_missing_paths(flexilims_session, root_name=root_name)

From 05f3b991502e7f915dcb4008616a6434ab90518d Mon Sep 17 00:00:00 2001
From: Antonin Blot <antonin.blot@crick.ac.uk>
Date: Thu, 31 Aug 2023 09:27:06 +0100
Subject: [PATCH 31/73] [minor] clearer error message in get_id

---
 flexiznam/main.py | 30 +++++++++++++++++++++++++-----
 1 file changed, 25 insertions(+), 5 deletions(-)

diff --git a/flexiznam/main.py b/flexiznam/main.py
index 2ec0867..04d2310 100755
--- a/flexiznam/main.py
+++ b/flexiznam/main.py
@@ -45,8 +45,9 @@ def get_data_root(which, project=None, flexilims_session=None):
         project = flexilims_session.project_id
 
     if project not in PARAMETERS["project_ids"]:
-        project = lookup_project(project, prm=None)
-        assert project is not None, f"Invalid project {project}"
+        proj = lookup_project(project, prm=None)
+        assert proj is not None, f"Invalid project {project}"
+        project = proj
 
     if project in PARAMETERS["project_paths"]:
         return Path(PARAMETERS["project_paths"][project][which])
@@ -798,7 +799,7 @@ def get_entities(
         :py:class:`pandas.DataFrame`: containing all matching entities
 
     """
-    assert (project_id is not None) or (flexilims_session is not None)
+    # assert (project_id is not None) or (flexilims_session is not None)
     if flexilims_session is None:
         flexilims_session = get_flexilims_session(project_id)
     results = flexilims_session.get(
@@ -942,6 +943,8 @@ def get_id(name, datatype=None, project_id=None, flexilims_session=None):
     entity = get_entity(
         datatype=datatype, flexilims_session=flexilims_session, name=name
     )
+    if entity is None:
+        raise FlexilimsError("Cannot find entity named `%s`" % name)
     return entity["id"]
 
 
@@ -1081,10 +1084,27 @@ def get_datasets_recursively(
     For example, this is useful if you want to retrieve paths to all *scanimage*
     datasets associated with a given session.
 
+    Args:
+        origin_id (str): hexadecimal ID of the origin session. Not required if
+            origin_name is provided.
+        origin_name (str): text name of the origin session. Not required if origin_id
+            is provided.
+        origin_series (pandas.Series): series of the origin session. Not required if
+            origin_id or origin_name is provided.
+        dataset_type (str): type of the dataseet to filter by. If `None`,
+            will return all datasets.
+        filter_datasets (dict): dictionary of key-value pairs to filter datasets by.
+        parent_type (str): type of the parent entity. If `None`, will return all
+        filter_parents (dict): dictionary of key-value pairs to filter parents by.
+        return_paths (bool): if True, return a list of paths
+        project_id (str): text name of the project. Not required if
+            `flexilims_session` is provided.
+        flexilims_session (:py:class:`flexilims.Flexilims`): Flexylims session object
+        _output (list): internal argument used for recursion.
+
     Returns:
         dict: Dictionary with direct parent id as keys and lists of associated
             datasets, or dataset paths as values
-
     """
     if origin_series is None:
         if origin_id is None:
@@ -1168,7 +1188,7 @@ def get_datasets(
             otherwise ensure that only one dataset exists online and return it.
         return_paths (bool): if True, return a list of paths
         return_dataseries (bool): if True, a dataframe or a dataseries
-        _output (list): internal argument used for recursion.
+
 
 
     """

From 91b0e121f29759be73c426e845d62463cda696bd Mon Sep 17 00:00:00 2001
From: Antonin Blot <antonin.blot@crick.ac.uk>
Date: Fri, 6 Oct 2023 09:58:01 +0100
Subject: [PATCH 32/73] [feature] add create yaml compatible with GUI

---
 flexiznam/camp/sync_data.py | 38 +++++++++++++++++++++++++++++++++++--
 1 file changed, 36 insertions(+), 2 deletions(-)

diff --git a/flexiznam/camp/sync_data.py b/flexiznam/camp/sync_data.py
index e1833ee..ab37407 100644
--- a/flexiznam/camp/sync_data.py
+++ b/flexiznam/camp/sync_data.py
@@ -16,6 +16,36 @@
 from flexiznam.utils import clean_recursively
 
 
+def create_yaml(root_folder, project, origin_name, output_file, overwrite=False):
+    """Create a yaml file from a folder
+
+    Args:
+        root_folder (str): Folder to parse
+        project (str): Name of the project
+        origin_name (str): Name of the origin on flexilims
+        output_file (str): Full path to output yaml.
+        overwrite (bool, optional): Overwrite output file if it exists. Defaults to False.
+    """
+    output_file = pathlib.Path(output_file)
+    if (not overwrite) and output_file.exists():
+        s = input("File %s already exists. Overwrite (yes/[no])? " % output_file)
+        if s == "yes":
+            overwrite = True
+        else:
+            raise (
+                FileExistsError(
+                    "File %s already exists and overwrite is not allowed" % output_file
+                )
+            )
+    root_folder = pathlib.Path(root_folder)
+    if not root_folder.is_dir():
+        raise FileNotFoundError("source_dir %s is not a directory" % root_folder)
+
+    data = create_yaml_dict(root_folder, project, origin_name)
+    with open(output_file, "w") as f:
+        yaml.dump(data, f)
+
+
 def create_yaml_dict(
     root_folder,
     project,
@@ -56,8 +86,12 @@ def create_yaml_dict(
         format_yaml=format_yaml,
         parent_dict=dict(),
     )
+    if format_yaml:
+        root_folder = str(root_folder.parent)
+    else:
+        root_folder = root_folder.parent
     out = dict(
-        root_folder=root_folder.parent,
+        root_folder=root_folder,
         origin_name=origin_name,
         children=data,
         project=project,
@@ -132,7 +166,7 @@ def upload_yaml(
         list of names of entities created/updated
 
     """
-    if isinstance(source_yaml, str):
+    if isinstance(source_yaml, str) or isinstance(source_yaml, Path):
         source_yaml = Path(source_yaml)
         with open(source_yaml, "r") as f:
             yaml_data = yaml.safe_load(f)

From 2b3d4339a3a3ef6f600013fa205e6b00a901f56a Mon Sep 17 00:00:00 2001
From: Antonin Blot <antonin.blot@crick.ac.uk>
Date: Fri, 6 Oct 2023 09:58:34 +0100
Subject: [PATCH 33/73] [bugfix] adapt CLI to new create_yaml

---
 flexiznam/cli.py | 35 +++++++----------------------------
 1 file changed, 7 insertions(+), 28 deletions(-)

diff --git a/flexiznam/cli.py b/flexiznam/cli.py
index ed7629b..bc25616 100644
--- a/flexiznam/cli.py
+++ b/flexiznam/cli.py
@@ -180,7 +180,9 @@ def add_password(app, username, password, password_file):
 @click.option(
     "-p", "--project", default="NOT SPECIFIED", help="Project name on flexilims."
 )
-@click.option("-m", "--mouse", default="NOT SPECIFIED", help="Mouse name on flexilims.")
+@click.option(
+    "-o", "--origin", default="NOT SPECIFIED", help="Origin name on flexilims."
+)
 @click.option(
     "--overwrite/--no-overwrite",
     default=False,
@@ -191,38 +193,15 @@ def add_password(app, username, password, password_file):
     default=False,
     help="After creating the yaml skeleton, should I also parse it?",
 )
-@click.option(
-    "-r",
-    "--raw_data_folder",
-    default=None,
-    help="Path to the root folder containing raw data. Only used with " "`--process`",
-)
-def create_yaml(
-    source_dir, target_yaml, project, mouse, overwrite, process, raw_data_folder
-):
+def create_yaml(source_dir, target_yaml, project, origin, overwrite, process):
     """Create a yaml file by looking recursively in `root_dir`"""
     from flexiznam import camp
-    import pathlib
 
-    target_yaml = pathlib.Path(target_yaml)
-    if (not overwrite) and target_yaml.exists():
-        s = input("File %s already exists. Overwrite (yes/[no])? " % target_yaml)
-        if s == "yes":
-            overwrite = True
-        else:
-            raise (
-                FileExistsError(
-                    "File %s already exists and overwrite is not allowed" % target_yaml
-                )
-            )
-    source_dir = pathlib.Path(source_dir)
-    if not source_dir.is_dir():
-        raise FileNotFoundError("source_dir %s is not a directory" % source_dir)
-    yml_content = camp.sync_data.create_yaml(
+    camp.sync_data.create_yaml(
         root_folder=source_dir,
-        outfile=target_yaml,
+        output_file=target_yaml,
+        origin_name=origin,
         project=project,
-        mouse=mouse,
         overwrite=overwrite,
     )
     click.echo("Created yml skeleton in %s" % target_yaml)

From df64b070433908b2ccfdfaa09c1f9d13dc4b4947 Mon Sep 17 00:00:00 2001
From: Antonin Blot <antonin.blot@crick.ac.uk>
Date: Fri, 6 Oct 2023 10:44:15 +0100
Subject: [PATCH 34/73] [feature] add parse_yaml that is GUI compatible

To parse existing yaml files
---
 flexiznam/camp/sync_data.py | 153 +++++++++++++++++++++++++++++-------
 1 file changed, 126 insertions(+), 27 deletions(-)

diff --git a/flexiznam/camp/sync_data.py b/flexiznam/camp/sync_data.py
index ab37407..0b3658a 100644
--- a/flexiznam/camp/sync_data.py
+++ b/flexiznam/camp/sync_data.py
@@ -16,11 +16,11 @@
 from flexiznam.utils import clean_recursively
 
 
-def create_yaml(root_folder, project, origin_name, output_file, overwrite=False):
+def create_yaml(folder_to_parse, project, origin_name, output_file, overwrite=False):
     """Create a yaml file from a folder
 
     Args:
-        root_folder (str): Folder to parse
+        folder_to_parse (str): Folder to parse
         project (str): Name of the project
         origin_name (str): Name of the origin on flexilims
         output_file (str): Full path to output yaml.
@@ -37,17 +37,17 @@ def create_yaml(root_folder, project, origin_name, output_file, overwrite=False)
                     "File %s already exists and overwrite is not allowed" % output_file
                 )
             )
-    root_folder = pathlib.Path(root_folder)
-    if not root_folder.is_dir():
-        raise FileNotFoundError("source_dir %s is not a directory" % root_folder)
+    folder_to_parse = pathlib.Path(folder_to_parse)
+    if not folder_to_parse.is_dir():
+        raise FileNotFoundError("source_dir %s is not a directory" % folder_to_parse)
 
-    data = create_yaml_dict(root_folder, project, origin_name)
+    data = create_yaml_dict(folder_to_parse, project, origin_name)
     with open(output_file, "w") as f:
         yaml.dump(data, f)
 
 
 def create_yaml_dict(
-    root_folder,
+    folder_to_parse,
     project,
     origin_name,
     format_yaml=True,
@@ -57,7 +57,7 @@ def create_yaml_dict(
     Recursively parse a folder and create a yaml dict with the structure of the folder.
 
     Args:
-        root_folder (str): Path to the folder to parse
+        folder_to_parse (str): Path to the folder to parse
         project (str): Name of the project, used as root of the path in the output
         origin_name (str): Name of the origin on flexilims. Must be online and have
             genealogy set.
@@ -76,20 +76,86 @@ def create_yaml_dict(
     assert origin is not None, f"Origin {origin_name} not found in project {project}"
     assert "genealogy" in origin, f"Origin {origin_name} has no genealogy"
     genealogy = origin["genealogy"]
-    root_folder = Path(root_folder)
-    assert root_folder.is_dir(), f"Folder {root_folder} does not exist"
+    folder_to_parse = Path(folder_to_parse)
+    assert folder_to_parse.is_dir(), f"Folder {folder_to_parse} does not exist"
 
     data = _create_yaml_dict(
-        level_folder=root_folder,
+        level_folder=folder_to_parse,
         project=project,
         genealogy=genealogy,
         format_yaml=format_yaml,
         parent_dict=dict(),
     )
     if format_yaml:
-        root_folder = str(root_folder.parent)
+        root_folder = str(folder_to_parse.parent)
     else:
-        root_folder = root_folder.parent
+        root_folder = folder_to_parse.parent
+    out = dict(
+        root_folder=root_folder,
+        origin_name=origin_name,
+        children=data,
+        project=project,
+    )
+    return out
+
+
+def parse_yaml(
+    yaml_file,
+    root_folder=None,
+    origin_name=None,
+    project=None,
+    format_yaml=True,
+):
+    """Parse a yaml file and check validity
+
+    This will add datasets to each existing levels of the yaml, but won't create
+    nested levels
+
+    Args:
+        yaml_file (str): path to the yaml file
+        root_folder (str): path to the root folder. If not provided, will be read from
+            the yaml file. This is the folder that contains the main folder, so "mouse"
+            for a  "session".
+        origin_name (str): name of the origin on flexilims. If not provided, will be
+            read from the yaml file
+        project (str): name of the project. If not provided, will be read from the yaml
+            file
+        format_yaml (bool, optional): Format the output to be yaml compatible if True,
+            otherwise keep dataset as Dataset object and path as pathlib.Path. Defaults
+            to True.
+    Returns
+        dict: yaml dict with datasets added
+    """
+    yaml_data = check_yaml_validity(yaml_file, root_folder, origin_name, project)
+    if root_folder is None:
+        root_folder = Path(yaml_data["root_folder"])
+    assert root_folder.is_dir(), f"Folder {root_folder} does not exist"
+
+    if project is None:
+        project = yaml_data["project"]
+    flm_sess = flz.get_flexilims_session(project_id=project)
+
+    if origin_name is None:
+        origin_name = yaml_data["origin_name"]
+    with warnings.catch_warnings():
+        warnings.simplefilter("ignore")
+        origin = flz.get_entity(name=origin_name, flexilims_session=flm_sess)
+    assert origin is not None, f"Origin {origin_name} not found in project {project}"
+    assert "genealogy" in origin, f"Origin {origin_name} has no genealogy"
+    genealogy = origin["genealogy"]
+
+    assert len(yaml_data["children"]) == 1, "Parsing only one folder is allowed"
+    child = list(yaml_data["children"].keys())[0]
+    data = _create_yaml_dict(
+        level_folder=root_folder / child,
+        project=project,
+        genealogy=genealogy,
+        format_yaml=format_yaml,
+        parent_dict=yaml_data["children"],
+        only_datasets=True,
+    )
+    if format_yaml:
+        root_folder = str(root_folder)
     out = dict(
         root_folder=root_folder,
         origin_name=origin_name,
@@ -204,6 +270,7 @@ def _create_yaml_dict(
     genealogy,
     format_yaml,
     parent_dict,
+    only_datasets=False,
 ):
     """Private function to create a yaml dict from a folder
 
@@ -219,34 +286,59 @@ def _create_yaml_dict(
         format_yaml (bool): format results to be yaml compatible or keep Dataset
             and pathlib.Path objects
         parent_dict (dict): dict of the parent folder. Used for recursion
+        only_datasets (bool): only parse datasets, not folders
     """
 
     level_folder = Path(level_folder)
     assert level_folder.is_dir(), "root_folder must be a directory"
-    level_dict = dict()
+    level_name = level_folder.name
+    if level_name in parent_dict:
+        level_dict = parent_dict[level_name]
+    else:
+        level_dict = dict()
     genealogy = list(genealogy)
 
-    level_name = level_folder.name
     m = re.fullmatch(r"R\d\d\d\d\d\d_?(.*)?", level_name)
     if m:
-        level_dict["type"] = "recording"
-        level_dict["protocol"] = (
-            m[1] if m[1] is not None else "XXERRORXX PROTOCOL NOT SPECIFIED"
-        )
-        level_dict["recording_type"] = "XXERRORXX error RECORDING TYPE NOT SPECIFIED"
-
+        if "type" in level_dict:
+            assert (
+                level_dict["type"] == "recording"
+            ), "Conflicting types, expected recording"
+        else:
+            level_dict["type"] = "recording"
+        if "protocol" not in level_dict:
+            level_dict["protocol"] = (
+                m[1] if m[1] is not None else "XXERRORXX PROTOCOL NOT SPECIFIED"
+            )
+        if "recording_type" not in level_dict:
+            level_dict["recording_type"] = "XXERRORXX RECORDING TYPE NOT SPECIFIED"
     elif re.fullmatch(r"S\d*", level_name):
-        level_dict["type"] = "session"
+        if "type" in level_dict:
+            assert (
+                level_dict["type"] == "session"
+            ), "Conflicting types, expected session"
+        else:
+            level_dict["type"] = "session"
     else:
-        level_dict["type"] = "sample"
-    level_dict["genealogy"] = genealogy + [level_name]
-    level_dict["path"] = Path(project, *level_dict["genealogy"])
+        if "type" not in level_dict:
+            level_dict["type"] = "sample"
+    if "genealogy" in level_dict:
+        assert level_dict["genealogy"] == genealogy + [
+            level_name
+        ], f"Conflicting genealogy for {level_name}"
+    else:
+        level_dict["genealogy"] = genealogy + [level_name]
+    if "path" not in level_dict:
+        level_dict["path"] = Path(project, *level_dict["genealogy"])
     if format_yaml:
         level_dict["path"] = str(PurePosixPath(level_dict["path"]))
-    children = dict()
+    children = dict() if "children" not in level_dict else level_dict["children"]
     datasets = Dataset.from_folder(level_folder)
     if datasets:
         for ds_name, ds in datasets.items():
+            if ds_name in children:
+                warnings.warn(f"Dataset {ds_name} already exists in {level_name}. Skip")
+                continue
             ds.genealogy = genealogy + list(ds.genealogy)
             if format_yaml:
                 # find path root
@@ -262,7 +354,14 @@ def _create_yaml_dict(
             else:
                 children[ds_name] = ds
 
-    for child in level_folder.glob("*"):
+    if only_datasets:
+        subfolders = [
+            level_folder / n for n, c in children.items() if c["type"] != "dataset"
+        ]
+    else:
+        subfolders = level_folder.glob("*")
+
+    for child in subfolders:
         if child.is_dir():
             _create_yaml_dict(
                 child,

From ce0b20e84f84f5cb9d001d3a8f20340b7b31d647 Mon Sep 17 00:00:00 2001
From: Antonin Blot <antonin.blot@crick.ac.uk>
Date: Fri, 6 Oct 2023 11:38:20 +0100
Subject: [PATCH 35/73] [feature] adapt parse yaml to empty levels

One can just give a recording name, with nothing below
---
 flexiznam/camp/sync_data.py | 34 ++++++++++++++++++++++++++++++----
 1 file changed, 30 insertions(+), 4 deletions(-)

diff --git a/flexiznam/camp/sync_data.py b/flexiznam/camp/sync_data.py
index 0b3658a..0710471 100644
--- a/flexiznam/camp/sync_data.py
+++ b/flexiznam/camp/sync_data.py
@@ -100,7 +100,7 @@ def create_yaml_dict(
 
 
 def parse_yaml(
-    yaml_file,
+    yaml_data,
     root_folder=None,
     origin_name=None,
     project=None,
@@ -112,7 +112,7 @@ def parse_yaml(
     nested levels
 
     Args:
-        yaml_file (str): path to the yaml file
+        yaml_file (str): path to the yaml file (or data as dict)
         root_folder (str): path to the root folder. If not provided, will be read from
             the yaml file. This is the folder that contains the main folder, so "mouse"
             for a  "session".
@@ -126,7 +126,10 @@ def parse_yaml(
     Returns
         dict: yaml dict with datasets added
     """
-    yaml_data = check_yaml_validity(yaml_file, root_folder, origin_name, project)
+    if isinstance(yaml_data, str) or isinstance(yaml_data, Path):
+        with open(yaml_data, "r") as f:
+            yaml_data = yaml.safe_load(f)
+
     if root_folder is None:
         root_folder = Path(yaml_data["root_folder"])
     assert root_folder.is_dir(), f"Folder {root_folder} does not exist"
@@ -162,10 +165,29 @@ def parse_yaml(
         children=data,
         project=project,
     )
+    yaml_data = check_yaml_validity(yaml_data, root_folder, origin_name, project)
     return out
 
 
 def check_yaml_validity(yaml_data, root_folder=None, origin_name=None, project=None):
+    """Check that a yaml file is valid
+
+    This will check that the genealogy is correct, that the datasets are valid and
+    that the folder structure is correct
+
+    Args:
+        yaml_file (str): path to the yaml file (or data as dict)
+        root_folder (str): path to the root folder. If not provided, will be read from
+            the yaml file. This is the folder that contains the main folder, so "mouse"
+            for a  "session".
+        origin_name (str): name of the origin on flexilims. If not provided, will be
+            read from the yaml file
+        project (str): name of the project. If not provided, will be read from the yaml
+            file
+
+    Returns:
+        dict: same as input yaml_data, but with errors added
+    """
     if isinstance(yaml_data, str) or isinstance(yaml_data, Path):
         with open(yaml_data, "r") as f:
             yaml_data = yaml.safe_load(f)
@@ -294,6 +316,8 @@ def _create_yaml_dict(
     level_name = level_folder.name
     if level_name in parent_dict:
         level_dict = parent_dict[level_name]
+        if level_dict is None:
+            level_dict = dict()
     else:
         level_dict = dict()
     genealogy = list(genealogy)
@@ -356,7 +380,9 @@ def _create_yaml_dict(
 
     if only_datasets:
         subfolders = [
-            level_folder / n for n, c in children.items() if c["type"] != "dataset"
+            level_folder / n
+            for n, c in children.items()
+            if (c is None) or (c.get("type", "unknown") != "dataset")
         ]
     else:
         subfolders = level_folder.glob("*")

From 4aee89e09036d44b47338ad2581146632fbc1a33 Mon Sep 17 00:00:00 2001
From: Antonin Blot <antonin.blot@crick.ac.uk>
Date: Fri, 6 Oct 2023 16:42:18 +0100
Subject: [PATCH 36/73] [bugfix] typo in parse_folder

---
 flexiznam/gui/flexigui.py | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

diff --git a/flexiznam/gui/flexigui.py b/flexiznam/gui/flexigui.py
index 91adf41..c3916ff 100644
--- a/flexiznam/gui/flexigui.py
+++ b/flexiznam/gui/flexigui.py
@@ -182,7 +182,7 @@ def parse_folder(self):
         self.report(f"Parsing folder {folder}...")
         self.root_folder.set(folder)
         data = flz.camp.sync_data.create_yaml_dict(
-            root_folder=folder,
+            folder_to_parse=folder,
             project=self.project.get(),
             origin_name=self.origin_name.get(),
             format_yaml=True,

From 73990b74ca931026fe16e8eae346ce77d01c2e72 Mon Sep 17 00:00:00 2001
From: Antonin Blot <antonin.blot@crick.ac.uk>
Date: Fri, 13 Oct 2023 16:33:56 +0100
Subject: [PATCH 37/73] [feature] get tickbox of the gui to do stuff

fixes #116
---
 flexiznam/camp/sync_data.py | 30 +++++++++++++---
 flexiznam/gui/flexigui.py   | 71 +++++++++++++++++++++++++++++++++----
 2 files changed, 90 insertions(+), 11 deletions(-)

diff --git a/flexiznam/camp/sync_data.py b/flexiznam/camp/sync_data.py
index 0710471..de4c0fd 100644
--- a/flexiznam/camp/sync_data.py
+++ b/flexiznam/camp/sync_data.py
@@ -165,7 +165,10 @@ def parse_yaml(
         children=data,
         project=project,
     )
-    yaml_data = check_yaml_validity(yaml_data, root_folder, origin_name, project)
+    yaml_data, errors = check_yaml_validity(
+        yaml_data, root_folder, origin_name, project
+    )
+
     return out
 
 
@@ -218,14 +221,14 @@ def check_yaml_validity(yaml_data, root_folder=None, origin_name=None, project=N
         origin = flz.get_entity(name=origin_name, flexilims_session=flm_sess)
     assert hasattr(origin, "genealogy"), f"Origin {origin_name} has no genealogy"
 
-    _check_recursively(
+    errors = _check_recursively(
         yaml_data["children"],
         origin_genealogy=origin["genealogy"],
         root_folder=root_folder,
         project=project,
         genealogy=[],
     )
-    return yaml_data
+    return yaml_data, errors
 
 
 def upload_yaml(
@@ -478,8 +481,16 @@ def _upload_yaml_dict(
 
 
 def _check_recursively(
-    yaml_data, origin_genealogy, root_folder, project, genealogy, fixerrors=False
+    yaml_data,
+    origin_genealogy,
+    root_folder,
+    project,
+    genealogy,
+    fixerrors=False,
+    errors=None,
 ):
+    if errors is None:
+        errors = dict()
     root_folder = Path(root_folder)
 
     for child, child_dict in yaml_data.items():
@@ -489,6 +500,7 @@ def _check_recursively(
         if child_dict["type"] != "dataset":
             if not fname.is_dir():
                 child_dict["PATH_ERROR"] = f"XXERRORXX folder {fname} does not exist"
+                errors[fname] = child_dict
         else:
             data_series = pd.Series(child_dict)
             for k, v in data_series.pop("extra_attributes").items():
@@ -500,6 +512,7 @@ def _check_recursively(
             msg = ds.is_valid(return_reason=True)
             if msg:
                 child_dict["VALIDATION_ERROR"] = f"XXERRORXX {msg}"
+                errors[fname] = child_dict
 
         if child_dict["genealogy"] != origin_genealogy + child_genealogy:
             if fixerrors:
@@ -507,6 +520,7 @@ def _check_recursively(
                 child_dict["genealogy"] = origin_genealogy + child_genealogy
             else:
                 child_dict["GENEALOGY_ERROR"] = f"XXERRORXX genealogy is not correct"
+                errors[fname] = child_dict
         if "children" in child_dict:
             _check_recursively(
                 child_dict["children"],
@@ -514,10 +528,18 @@ def _check_recursively(
                 root_folder,
                 project,
                 genealogy=genealogy + [child],
+                fixerrors=fixerrors,
+                errors=errors,
             )
+    return errors
 
 
 if __name__ == "__main__":
+    example_yml = "/Users/blota/Desktop/test_yaml.yml"
+    out = parse_yaml(example_yml)
+    with open("/Users/blota/Desktop/test_yaml_redump.yml", "w") as f:
+        yaml.dump(out, f)
+
     rel = "blota_onix_pilote/BRAC7448.2d/"
     root_folder = Path(flz.PARAMETERS["data_root"]["raw"]) / rel
     yaml_file = Path(flz.PARAMETERS["data_root"]["processed"]) / rel / "S20230421.yml"
diff --git a/flexiznam/gui/flexigui.py b/flexiznam/gui/flexigui.py
index c3916ff..cba5684 100644
--- a/flexiznam/gui/flexigui.py
+++ b/flexiznam/gui/flexigui.py
@@ -157,6 +157,22 @@ def _create_statusbar(self):
 
     ############# GUI update methods #############
     # These methods are used to actually do stuff with the GUI elements
+    def get_checked_data(self, item=None, checked_data=None):
+        if checked_data is None:
+            checked_data = dict(children=dict())
+            for k in ["project", "origin_name", "root_folder"]:
+                checked_data[k] = self.data[k]
+
+        for child in self.treeview.get_children(item=item):
+            if self.treeview.tag_has("checked", child):
+                name, data = self._entity_by_itemid[child]
+                data = data.copy()
+                if "children" in data:
+                    data["children"] = {}
+                data = self.get_checked_data(item=child, checked_data=data)
+                checked_data["children"][name] = data
+        return checked_data
+
     def report(self, message):
         self.sb_msg.set(message)
         print(message)
@@ -188,9 +204,11 @@ def parse_folder(self):
             format_yaml=True,
         )
         self.report("Parsing done. Validating data...")
-        data = flz.camp.sync_data.check_yaml_validity(data)
+        data, errors = flz.camp.sync_data.check_yaml_validity(data)
         self.data = data
-        self.update_data()
+        self.update_data(remove_unchecked=False)
+        checked = self.get_checked_data(item=None, checked_data=None)
+        assert checked == self.data
         self.report("Done")
 
     def chg_root_folder(self):
@@ -229,17 +247,20 @@ def load_yaml(self):
         self.update_data()
         self.report("Done")
 
-    def update_data(self, name_to_select=None):
+    def update_data(self, name_to_select=None, remove_unchecked=True):
         """Update GUI data from self.data
 
         Args:
             name_to_select (str, optional): Name of item to select in treeview.
                 Defaults to None."""
         self.report("Updating GUI")
+        if remove_unchecked:
+            self.data = self.get_checked_data()
         self.textview.delete("1.0", tk.END)
         self.selected_item.set("None")
         self.treeview.delete(*self.treeview.get_children())
         self._entity_by_itemid = {}
+
         if "project" in self.data:
             self.project.set(self.data["project"])
         if "origin_name" in self.data:
@@ -262,7 +283,6 @@ def _insert_yaml_data(self, data, parent="", name_to_select=None):
                 values=[dtype],
                 open=True,
             )
-            self.treeview.change_state(item, "checked")
             if any(
                 [
                     v.startswith("XXERRORXX")
@@ -272,8 +292,8 @@ def _insert_yaml_data(self, data, parent="", name_to_select=None):
             ):
                 self.contains_errors = True
                 self.report(f"ERROR: {child} contains errors")
-                self.treeview.item(item, tags=("error",))
-
+                self.treeview.item(item, tags=("error", "checked"))
+            self.treeview.change_state(item, "checked")
             self._entity_by_itemid[item] = (child, child_data)
             if name_to_select and child == name_to_select:
                 self.treeview.focus(item)
@@ -313,7 +333,8 @@ def upload(self):
             return
 
         self.report("Validating data...")
-        self.data = flz.camp.sync_data.check_yaml_validity(self.data)
+        self.update_data()
+        data, errors = flz.camp.sync_data.check_yaml_validity(self.get_checked_data())
 
         if self.contains_errors:
             tk.messagebox.showerror(
@@ -323,8 +344,16 @@ def upload(self):
             return
 
         data = dict(self.data)
+        # remove unchecked items
+        for item in self.treeview.get_children():
+            if not self.treeview.tag_has("checked", item):
+                name, _ = self._entity_by_itemid[item]
+                self.report(f"Removing item {name}")
+                data["children"].pop(name)
+
         data["project"] = self.project.get()
         data["root_folder"] = self.root_folder.get()
+
         self.report("Validating data...")
         flz.camp.sync_data.upload_yaml(
             source_yaml=data,
@@ -368,5 +397,33 @@ def update_item(self):
 
 
 if __name__ == "__main__":
+
+    def diffofdict(d1, d2, diff=None, level=""):
+        """Find differences between 2 dictionary of dictionaries"""
+
+        if diff is None:
+            diff = []
+        all_keys = set(list(d1.keys()) + list(d2.keys()))
+        for k in all_keys:
+            level = level + k + "."
+            if k not in d2:
+                diff.append(f"{level} (missing in d2)")
+            elif k not in d1:
+                diff.append(f"{level} (missing in d1)")
+            elif isinstance(d1[k], dict):
+                diff = diffofdict(d1[k], d2[k], diff, level)
+            elif d1[k] != d2[k]:
+                diff.append(f"{level} ({d1[k]} != {d2[k]})")
+        return diff
+
     app = FlexiGui()
+    app.root_folder.set(
+        "/Volumes/lab-znamenskiyp/data/instruments/raw_data/projects/blota_onix_pilote/BRYA142.5d/"
+    )
+    app.origin_name.set("BRYA142.5d")
+    app.project.set("blota_onix_pilote")
     app.mainloop()
+    df = diffofdict(app.data["children"], app.get_checked_data()["children"])
+    a = app.data["children"]["S20230915"]["children"]
+    b = app.get_checked_data()["children"]["S20230915"]["children"]
+    a == b

From f232d0a24a4b8b554a49bd4c4e56a3eeac0fee2e Mon Sep 17 00:00:00 2001
From: Antonin Blot <antonin.blot@crick.ac.uk>
Date: Fri, 20 Oct 2023 15:11:24 +0100
Subject: [PATCH 38/73] [minor] Clearer error message in create_yaml

---
 flexiznam/cli.py | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

diff --git a/flexiznam/cli.py b/flexiznam/cli.py
index fd6bddc..ae0bf53 100644
--- a/flexiznam/cli.py
+++ b/flexiznam/cli.py
@@ -206,7 +206,7 @@ def create_yaml(source_dir, target_yaml, project, origin, overwrite, process):
     )
     click.echo("Created yml skeleton in %s" % target_yaml)
     if process:
-        raise NotImplementedError
+        raise NotImplementedError("Process yaml at creation is not implemented yet")
 
 
 @cli.command()

From 254e8705cc758cd29ed6ca5142a76b4d45114d85 Mon Sep 17 00:00:00 2001
From: Antonin Blot <antonin.blot@crick.ac.uk>
Date: Mon, 23 Oct 2023 13:55:49 +0100
Subject: [PATCH 39/73] [bugfix] tristate tag was not recognised in gui

---
 flexiznam/gui/flexigui.py | 4 +++-
 1 file changed, 3 insertions(+), 1 deletion(-)

diff --git a/flexiznam/gui/flexigui.py b/flexiznam/gui/flexigui.py
index cba5684..bad2b49 100644
--- a/flexiznam/gui/flexigui.py
+++ b/flexiznam/gui/flexigui.py
@@ -164,7 +164,9 @@ def get_checked_data(self, item=None, checked_data=None):
                 checked_data[k] = self.data[k]
 
         for child in self.treeview.get_children(item=item):
-            if self.treeview.tag_has("checked", child):
+            if self.treeview.tag_has("checked", child) or self.treeview.tag_has(
+                "tristate", child
+            ):
                 name, data = self._entity_by_itemid[child]
                 data = data.copy()
                 if "children" in data:

From aefc1cb13b146fe7fbf91669beba5f841b2c7c63 Mon Sep 17 00:00:00 2001
From: Antonin Blot <antonin.blot@crick.ac.uk>
Date: Wed, 25 Oct 2023 09:44:25 +0100
Subject: [PATCH 40/73] [bugfix] scanimage valid with full path

---
 flexiznam/schema/scanimage_data.py | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

diff --git a/flexiznam/schema/scanimage_data.py b/flexiznam/schema/scanimage_data.py
index a973a74..9468890 100644
--- a/flexiznam/schema/scanimage_data.py
+++ b/flexiznam/schema/scanimage_data.py
@@ -240,7 +240,7 @@ def is_valid(self, return_reason=False, tif_files=None):
         # checking file one by one is long, compare sets
         tif_files = set(tif_files)
         existing_file = {
-            f for f in os.listdir(self.path) if f.endswith(("tif", ".tiff"))
+            f for f in os.listdir(self.path_full) if f.endswith(("tif", ".tiff"))
         }
         if tif_files - existing_file:
             msg = "Some tif files do not exist: %s" % (tif_files - existing_file)

From 6604eb34ed0eccda6c7facd81f91748934daea82 Mon Sep 17 00:00:00 2001
From: Antonin Blot <antonin.blot@crick.ac.uk>
Date: Wed, 25 Oct 2023 09:58:22 +0100
Subject: [PATCH 41/73] [feature] relax constraint on recording type

Try to guess the recording type from the protocol name and default to the accepted "NOT SPECIFIED" value
---
 flexiznam/camp/sync_data.py | 9 ++++++++-
 1 file changed, 8 insertions(+), 1 deletion(-)

diff --git a/flexiznam/camp/sync_data.py b/flexiznam/camp/sync_data.py
index de4c0fd..fd6139a 100644
--- a/flexiznam/camp/sync_data.py
+++ b/flexiznam/camp/sync_data.py
@@ -338,7 +338,14 @@ def _create_yaml_dict(
                 m[1] if m[1] is not None else "XXERRORXX PROTOCOL NOT SPECIFIED"
             )
         if "recording_type" not in level_dict:
-            level_dict["recording_type"] = "XXERRORXX RECORDING TYPE NOT SPECIFIED"
+            if "camera" in level_dict["protocol"]:
+                level_dict["recording_type"] = "camera"
+            elif "onix" in level_dict["protocol"]:
+                level_dict["recording_type"] = "ephys"
+            elif "harp" in level_dict["protocol"]:
+                level_dict["recording_type"] = "behaviour"
+            else:
+                level_dict["recording_type"] = "NOT SPECIFIED"
     elif re.fullmatch(r"S\d*", level_name):
         if "type" in level_dict:
             assert (

From 1f5cdc43046f7b3c5f46d3a1547251491276f5a0 Mon Sep 17 00:00:00 2001
From: Antonin Blot <antonin.blot@crick.ac.uk>
Date: Tue, 9 Jan 2024 13:58:10 +0000
Subject: [PATCH 42/73] [feature] add a visstim dataset

To be able to decouple harp and visual stimulation

TODO: allow multiple datasets per folder? Or make the parent folder the dataset instead of a recording?
---
 flexiznam/schema/__init__.py                  |   2 +
 flexiznam/schema/visstim_data.py              | 139 ++++++++++++++++++
 .../tests_schema/test_visstim.py              |  24 +++
 3 files changed, 165 insertions(+)
 create mode 100644 flexiznam/schema/visstim_data.py
 create mode 100644 tests/test_components/tests_schema/test_visstim.py

diff --git a/flexiznam/schema/__init__.py b/flexiznam/schema/__init__.py
index c5ba526..d922e26 100644
--- a/flexiznam/schema/__init__.py
+++ b/flexiznam/schema/__init__.py
@@ -29,6 +29,7 @@
 from .microscopy_data import MicroscopyData
 from .onix_data import OnixData
 from .sequencing_data import SequencingData
+from .visstim_data import VisStimData
 
 Dataset.SUBCLASSES["camera"] = CameraData
 Dataset.SUBCLASSES["harp"] = HarpData
@@ -36,3 +37,4 @@
 Dataset.SUBCLASSES["microscopy"] = MicroscopyData
 Dataset.SUBCLASSES["onix"] = OnixData
 Dataset.SUBCLASSES["sequencing"] = SequencingData
+Dataset.SUBCLASSES["visstim"] = VisStimData
diff --git a/flexiznam/schema/visstim_data.py b/flexiznam/schema/visstim_data.py
new file mode 100644
index 0000000..d7084f4
--- /dev/null
+++ b/flexiznam/schema/visstim_data.py
@@ -0,0 +1,139 @@
+import datetime
+import os
+import pathlib
+import re
+
+from flexiznam.schema.datasets import Dataset
+
+
+class VisStimData(Dataset):
+    DATASET_TYPE = "visstim"
+
+    @classmethod
+    def from_folder(
+        cls,
+        folder,
+        folder_genealogy=None,
+        is_raw=None,
+        verbose=True,
+        flexilims_session=None,
+        project=None,
+    ):
+        """Create a visual stimulation dataset by loading info from folder
+
+        A visual stimulation dataset is a folder containing at least a `FrameLog.csv` 
+        file and any number of other associated csvs.
+
+        Args:
+            folder (str): path to the folder
+            folder_genealogy (tuple): genealogy of the folder, if None assume that
+                                      the genealogy is just (folder,), i.e. no parents
+            is_raw (bool): does this folder contain raw data?
+            verbose (bool=True): print info about what is found
+            flexilims_session (flm.Session): session to interact with flexilims
+            project (str): project ID or name
+
+        Returns:
+            dict of dataset (flz.schema.harp_data.HarpData)
+        """
+
+        csv_files = list(pathlib.Path(folder).glob("*.csv"))
+        
+        fnames = [f.name for f in csv_files]
+        if 'framelog.csv' not in [f.lower() for f in fnames]:
+            raise IOError("Cannot find FrameLog.csv file")
+        
+        log_file = [f for f in csv_files if f.name.lower() == 'framelog.csv'][0]
+        if verbose:
+            print(f"Found FrameLog.csv file: {log_file}")
+            
+        if folder_genealogy is None:
+            folder_genealogy = (pathlib.Path(folder).stem,)
+        elif isinstance(folder_genealogy, list):
+            folder_genealogy = tuple(folder_genealogy)
+        output = {}
+        extra_attributes = dict(csv_files={f.stem: f.name for f in csv_files})
+        genealogy = folder_genealogy + ("visstim",)
+        created = datetime.datetime.fromtimestamp(log_file.stat().st_mtime)
+        output["visstim"] = VisStimData(
+            genealogy=genealogy,
+            is_raw=is_raw,
+            path=folder,
+            extra_attributes=extra_attributes,
+            created=created.strftime("%Y-%m-%d %H:%M:%S"),
+            flexilims_session=flexilims_session,
+            project=project,
+        )
+        return output
+
+    def __init__(
+        self,
+        path,
+        is_raw=None,
+        genealogy=None,
+        extra_attributes=None,
+        created=None,
+        project=None,
+        project_id=None,
+        origin_id=None,
+        id=None,
+        flexilims_session=None,
+    ):
+        """Create a VisStim dataset
+
+        Args:
+            path: folder containing the dataset or path to file (valid only for single
+                  file datasets)
+            is_raw: bool, used to sort in raw and processed subfolders
+            genealogy (tuple): parents of this dataset from the project (excluded) down to
+                               the dataset name itself (included)
+            extra_attributes: dict, optional attributes.
+            created: Creation date, in "YYYY-MM-DD HH:mm:SS"
+            project: name of the project. Must be in config, can be guessed from
+                     project_id
+            project_id: hexadecimal code for the project. Must be in config, can be
+                        guessed from project
+            origin_id: hexadecimal code for the origin on flexilims.
+            id: hexadecimal code for the dataset on flexilims.
+            flexilims_session: authentication session to connect to flexilims
+
+        Expected extra_attributes:
+            csv_files (optional): Dictionary of csv files associated to the binary file.
+                                  Keys are identifier provided for convenience,
+                                  values are the full file name
+        """
+
+        super().__init__(
+            genealogy=genealogy,
+            path=path,
+            is_raw=is_raw,
+            dataset_type=VisStimData.DATASET_TYPE,
+            extra_attributes=extra_attributes,
+            created=created,
+            project=project,
+            project_id=project_id,
+            origin_id=origin_id,
+            id=id,
+            flexilims_session=flexilims_session,
+        )
+
+    @property
+    def csv_files(self):
+        return self.extra_attributes.get("csv_files", None)
+
+    @csv_files.setter
+    def csv_files(self, value):
+        self.extra_attributes["csv_files"] = str(value)
+
+    def is_valid(self, return_reason=False):
+        """Check that all csv files exist
+
+        Args:
+            return_reason (bool): if True, return a string with the reason why the
+                                  dataset is not valid
+        Returns:"""
+        for _, file_path in self.csv_files.items():
+            if not (self.path_full / file_path).exists():
+                msg = f"Missing file {file_path}"
+                return msg if return_reason else False
+        return  "" if return_reason else True
\ No newline at end of file
diff --git a/tests/test_components/tests_schema/test_visstim.py b/tests/test_components/tests_schema/test_visstim.py
new file mode 100644
index 0000000..f5d6507
--- /dev/null
+++ b/tests/test_components/tests_schema/test_visstim.py
@@ -0,0 +1,24 @@
+import pytest
+from flexiznam.schema.visstim_data import VisStimData
+from tests.tests_resources.data_for_testing import DATA_ROOT
+
+
+def test_vistim():
+    folder_genealogy = ["mouse_onix", "S20230915", "R165222_SpheresPermTubeReward"]
+    data_dir = DATA_ROOT.joinpath(*folder_genealogy)
+    ds = VisStimData.from_folder(data_dir, verbose=False)
+    assert len(ds) == 1
+    ds_name = "visstim"
+    d = ds[ds_name]
+    assert d.full_name == folder_genealogy[-1] + "_" + ds_name
+    d.project = "demo_project"
+    assert d.is_valid()
+    assert len(d.csv_files) == 4
+    ds = VisStimData.from_folder(
+        data_dir, verbose=False, folder_genealogy=folder_genealogy
+    )
+    d = ds[ds_name]
+    d.project = "demo_project"
+    assert d.full_name == "_".join(folder_genealogy + [ds_name])
+    assert d.is_valid()
+    assert len(d.csv_files) == 4

From 45187277e931076fd047715f956583e1283652d8 Mon Sep 17 00:00:00 2001
From: Antonin Blot <antonin.blot@crick.ac.uk>
Date: Tue, 30 Jan 2024 17:54:20 +0000
Subject: [PATCH 43/73] [feature] add conflicts to add_mouse

---
 CHANGELOG.md      |  1 +
 flexiznam/main.py | 32 ++++++++++++++++++++++++--------
 2 files changed, 25 insertions(+), 8 deletions(-)

diff --git a/CHANGELOG.md b/CHANGELOG.md
index 122fa2c..c979a8b 100644
--- a/CHANGELOG.md
+++ b/CHANGELOG.md
@@ -12,6 +12,7 @@
 
 ### Minor
 - `add_mouse` uploads birth and death dates in a human readable format instead.
+- Add `conflicts` argument to `add_mouse` to overwrite existing mice
 - `get_entities` does not raise warnings anymore if `name` is specified and `datatype` 
 is not. This is now supported upstream by `flexilims`
 
diff --git a/flexiznam/main.py b/flexiznam/main.py
index e7c94be..daac58c 100755
--- a/flexiznam/main.py
+++ b/flexiznam/main.py
@@ -139,6 +139,7 @@ def add_mouse(
     mcms_password=None,
     flexilims_username=None,
     flexilims_password=None,
+    conflicts="abort",
 ):
     """Check if a mouse is already in the database and add it if it isn't
 
@@ -162,6 +163,8 @@ def add_mouse(
                                   flexilims session is not provided
         flexilims_password (str): [optional] password for flexilims, used only if
                                   flexilims session is not provided
+        conflicts (str): `abort`, `skip`, `update` or `overwrite` (see update_entity for
+                        detailed description)
 
     Returns (dict):
         flexilims reply
@@ -175,8 +178,14 @@ def add_mouse(
 
     mice_df = get_entities(flexilims_session=flexilims_session, datatype="mouse")
     if mouse_name in mice_df.index:
-        print("Mouse already online")
-        return mice_df.loc[mouse_name]
+        if conflicts.lower() == "skip":
+            print("Mouse already online")
+            return mice_df.loc[mouse_name]
+        elif conflicts.lower() == "abort":
+            raise FlexilimsError("Mouse already online")
+        is_online = True
+    else:
+        is_online = False
 
     if mouse_info is None:
         mouse_info = {}
@@ -222,12 +231,19 @@ def add_mouse(
     mouse_info["genealogy"] = [mouse_name]
     project_name = lookup_project(flexilims_session.project_id, PARAMETERS)
     mouse_info["path"] = str(Path(project_name) / mouse_name)
-    resp = flexilims_session.post(
-        datatype="mouse",
-        name=mouse_name,
-        attributes=mouse_info,
-        strict_validation=False,
-    )
+    if is_online:
+        resp = update_entity(datatype='mouse',
+                name=mouse_name,
+                mode=conflicts,
+                attributes=mouse_info,
+                flexilims_session=flexilims_session)
+    else:
+        resp = flexilims_session.post(
+            datatype="mouse",
+            name=mouse_name,
+            attributes=mouse_info,
+            strict_validation=False,
+        )
     return resp
 
 

From 49637217749e17ba36852334afcfd4130092b00b Mon Sep 17 00:00:00 2001
From: Antonin Blot <antonin.blot@crick.ac.uk>
Date: Thu, 1 Feb 2024 11:06:40 +0000
Subject: [PATCH 44/73] [minor] improve error message

---
 CHANGELOG.md      |  1 +
 flexiznam/mcms.py | 10 +++++++---
 2 files changed, 8 insertions(+), 3 deletions(-)

diff --git a/CHANGELOG.md b/CHANGELOG.md
index c979a8b..6611da1 100644
--- a/CHANGELOG.md
+++ b/CHANGELOG.md
@@ -15,6 +15,7 @@
 - Add `conflicts` argument to `add_mouse` to overwrite existing mice
 - `get_entities` does not raise warnings anymore if `name` is specified and `datatype` 
 is not. This is now supported upstream by `flexilims`
+- Clearer error message when mouse info cannot be found in MCMS
 
 ### Bugfixes
 
diff --git a/flexiznam/mcms.py b/flexiznam/mcms.py
index 2976375..24c23ee 100644
--- a/flexiznam/mcms.py
+++ b/flexiznam/mcms.py
@@ -1,8 +1,8 @@
 import re
 import pandas as pd
-from pymcms.main import McmsSession
+from requests.exceptions import InvalidURL
 from flexiznam.config import PARAMETERS, get_password
-
+from pymcms.main import McmsSession
 
 def get_mouse_info(mouse_name, username, password=None):
     """Load mouse info from mcms in a dataframe
@@ -18,7 +18,11 @@ def get_mouse_info(mouse_name, username, password=None):
     if password is None:
         password = get_password(username=username, app="mcms")
     mcms_sess = McmsSession(username=username, password=password)
-    original_data = mcms_sess.get_animal(name=mouse_name)
+    try:
+        original_data = mcms_sess.get_animal(name=mouse_name)
+    except InvalidURL:
+        raise InvalidURL(f"Mouse {mouse_name} not found under your PPL")
+
     # convert to camel case for flexlilims
     mouse_data = {}
     pattern = re.compile(r"(?<!^)(?=[A-Z])")

From e2d251ad3e78b6eef783b361770dec60e976dc21 Mon Sep 17 00:00:00 2001
From: Antonin Blot <antonin.blot@crick.ac.uk>
Date: Thu, 29 Feb 2024 19:12:02 +0000
Subject: [PATCH 45/73] [bugfix] issue in update_flexilims skip

In skip mode, update_entity was called with the full name instead of the dataset name
---
 flexiznam/schema/datasets.py | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

diff --git a/flexiznam/schema/datasets.py b/flexiznam/schema/datasets.py
index 6ea04b9..548cd8a 100644
--- a/flexiznam/schema/datasets.py
+++ b/flexiznam/schema/datasets.py
@@ -414,7 +414,7 @@ def update_flexilims(self, mode="safe"):
                 resp = flz.update_entity(
                     datatype="dataset",
                     id=self.id,
-                    name=self.full_name,
+                    name=self.dataset_name,
                     origin_id=self.origin_id,
                     mode=mode,
                     attributes=attributes,

From 4913451d5fc15912fa454f84db3e0f89e0c148d2 Mon Sep 17 00:00:00 2001
From: Antonin Blot <antonin.blot@crick.ac.uk>
Date: Thu, 29 Feb 2024 19:12:40 +0000
Subject: [PATCH 46/73] [style] black

---
 flexiznam/main.py                | 12 +++++++-----
 flexiznam/mcms.py                |  1 +
 flexiznam/schema/visstim_data.py | 14 +++++++-------
 3 files changed, 15 insertions(+), 12 deletions(-)

diff --git a/flexiznam/main.py b/flexiznam/main.py
index daac58c..e13bfdf 100755
--- a/flexiznam/main.py
+++ b/flexiznam/main.py
@@ -232,11 +232,13 @@ def add_mouse(
     project_name = lookup_project(flexilims_session.project_id, PARAMETERS)
     mouse_info["path"] = str(Path(project_name) / mouse_name)
     if is_online:
-        resp = update_entity(datatype='mouse',
-                name=mouse_name,
-                mode=conflicts,
-                attributes=mouse_info,
-                flexilims_session=flexilims_session)
+        resp = update_entity(
+            datatype="mouse",
+            name=mouse_name,
+            mode=conflicts,
+            attributes=mouse_info,
+            flexilims_session=flexilims_session,
+        )
     else:
         resp = flexilims_session.post(
             datatype="mouse",
diff --git a/flexiznam/mcms.py b/flexiznam/mcms.py
index 24c23ee..8e22bce 100644
--- a/flexiznam/mcms.py
+++ b/flexiznam/mcms.py
@@ -4,6 +4,7 @@
 from flexiznam.config import PARAMETERS, get_password
 from pymcms.main import McmsSession
 
+
 def get_mouse_info(mouse_name, username, password=None):
     """Load mouse info from mcms in a dataframe
 
diff --git a/flexiznam/schema/visstim_data.py b/flexiznam/schema/visstim_data.py
index d7084f4..b3a179f 100644
--- a/flexiznam/schema/visstim_data.py
+++ b/flexiznam/schema/visstim_data.py
@@ -21,7 +21,7 @@ def from_folder(
     ):
         """Create a visual stimulation dataset by loading info from folder
 
-        A visual stimulation dataset is a folder containing at least a `FrameLog.csv` 
+        A visual stimulation dataset is a folder containing at least a `FrameLog.csv`
         file and any number of other associated csvs.
 
         Args:
@@ -38,15 +38,15 @@ def from_folder(
         """
 
         csv_files = list(pathlib.Path(folder).glob("*.csv"))
-        
+
         fnames = [f.name for f in csv_files]
-        if 'framelog.csv' not in [f.lower() for f in fnames]:
+        if "framelog.csv" not in [f.lower() for f in fnames]:
             raise IOError("Cannot find FrameLog.csv file")
-        
-        log_file = [f for f in csv_files if f.name.lower() == 'framelog.csv'][0]
+
+        log_file = [f for f in csv_files if f.name.lower() == "framelog.csv"][0]
         if verbose:
             print(f"Found FrameLog.csv file: {log_file}")
-            
+
         if folder_genealogy is None:
             folder_genealogy = (pathlib.Path(folder).stem,)
         elif isinstance(folder_genealogy, list):
@@ -136,4 +136,4 @@ def is_valid(self, return_reason=False):
             if not (self.path_full / file_path).exists():
                 msg = f"Missing file {file_path}"
                 return msg if return_reason else False
-        return  "" if return_reason else True
\ No newline at end of file
+        return "" if return_reason else True

From a67cbde1e24b48a37c213a3abdc5e9e870be3117 Mon Sep 17 00:00:00 2001
From: Antonin Blot <antonin.blot@crick.ac.uk>
Date: Thu, 29 Feb 2024 19:21:40 +0000
Subject: [PATCH 47/73] [style] ruff to remove unused imports

---
 flexiznam/camp/sync_data.py                               | 8 +-------
 flexiznam/cli.py                                          | 1 -
 flexiznam/gui/flexigui.py                                 | 1 -
 flexiznam/main.py                                         | 4 ++--
 flexiznam/mcms.py                                         | 2 +-
 flexiznam/schema/datasets.py                              | 2 --
 flexiznam/schema/sequencing_data.py                       | 2 --
 flexiznam/schema/visstim_data.py                          | 2 --
 tests/test_2p.py                                          | 1 -
 tests/test_barseq.py                                      | 1 -
 tests/test_components/test_cli.py                         | 1 -
 tests/test_components/test_main.py                        | 2 +-
 tests/test_components/test_utils.py                       | 1 -
 tests/test_components/tests_schema/test_camera_data.py    | 1 -
 tests/test_components/tests_schema/test_harp.py           | 1 -
 .../test_components/tests_schema/test_microscopy_data.py  | 1 -
 tests/test_components/tests_schema/test_scanimage_data.py | 1 -
 .../test_components/tests_schema/test_sequencing_data.py  | 3 +--
 tests/test_components/tests_schema/test_visstim.py        | 1 -
 tests/tests_resources/data_for_testing.py                 | 3 +--
 20 files changed, 7 insertions(+), 32 deletions(-)

diff --git a/flexiznam/camp/sync_data.py b/flexiznam/camp/sync_data.py
index fd6139a..8b3aa3a 100644
--- a/flexiznam/camp/sync_data.py
+++ b/flexiznam/camp/sync_data.py
@@ -1,19 +1,13 @@
 """File to handle acquisition yaml file and create datasets on flexilims"""
-import os
 import pathlib
 from pathlib import Path, PurePosixPath
 import re
-import copy
 import warnings
 import pandas as pd
 import yaml
-from yaml.parser import ParserError
 
 import flexiznam as flz
-from flexiznam.errors import SyncYmlError, FlexilimsError
 from flexiznam.schema import Dataset
-from flexiznam.config import PARAMETERS
-from flexiznam.utils import clean_recursively
 
 
 def create_yaml(folder_to_parse, project, origin_name, output_file, overwrite=False):
@@ -526,7 +520,7 @@ def _check_recursively(
                 print(f"Fixing genealogy for {child}")
                 child_dict["genealogy"] = origin_genealogy + child_genealogy
             else:
-                child_dict["GENEALOGY_ERROR"] = f"XXERRORXX genealogy is not correct"
+                child_dict["GENEALOGY_ERROR"] = "XXERRORXX genealogy is not correct"
                 errors[fname] = child_dict
         if "children" in child_dict:
             _check_recursively(
diff --git a/flexiznam/cli.py b/flexiznam/cli.py
index ae0bf53..61e97d7 100644
--- a/flexiznam/cli.py
+++ b/flexiznam/cli.py
@@ -319,7 +319,6 @@ def check_flexilims_issues(
     """
     from flexiznam.main import get_flexilims_session
     from flexiznam import utils
-    import pathlib
     import pandas as pd
 
     flexilims_session = get_flexilims_session(
diff --git a/flexiznam/gui/flexigui.py b/flexiznam/gui/flexigui.py
index bad2b49..eb9b3a8 100644
--- a/flexiznam/gui/flexigui.py
+++ b/flexiznam/gui/flexigui.py
@@ -1,6 +1,5 @@
 import os
 import tkinter as tk
-from tkinter import ttk
 from ttkwidgets import CheckboxTreeview
 import yaml
 from pathlib import Path
diff --git a/flexiznam/main.py b/flexiznam/main.py
index e13bfdf..1530d98 100755
--- a/flexiznam/main.py
+++ b/flexiznam/main.py
@@ -9,8 +9,8 @@
 import flexiznam
 import yaml
 from flexiznam import mcms
-from flexiznam.config import PARAMETERS, get_password, add_password
-from flexiznam.errors import NameNotUniqueError, FlexilimsError, ConfigurationError
+from flexiznam.config import PARAMETERS, get_password
+from flexiznam.errors import NameNotUniqueError, FlexilimsError
 
 
 warnings.simplefilter("always", DeprecationWarning)
diff --git a/flexiznam/mcms.py b/flexiznam/mcms.py
index 8e22bce..8d756b7 100644
--- a/flexiznam/mcms.py
+++ b/flexiznam/mcms.py
@@ -1,7 +1,7 @@
 import re
 import pandas as pd
 from requests.exceptions import InvalidURL
-from flexiznam.config import PARAMETERS, get_password
+from flexiznam.config import get_password
 from pymcms.main import McmsSession
 
 
diff --git a/flexiznam/schema/datasets.py b/flexiznam/schema/datasets.py
index 548cd8a..3f08709 100644
--- a/flexiznam/schema/datasets.py
+++ b/flexiznam/schema/datasets.py
@@ -1,9 +1,7 @@
 import pathlib
 from datetime import datetime
 from pathlib import Path, PurePosixPath
-import numpy as np
 import pandas as pd
-from flexilims.utils import check_flexilims_validity
 import flexiznam as flz
 from flexiznam import utils
 from flexiznam.errors import FlexilimsError, DatasetError
diff --git a/flexiznam/schema/sequencing_data.py b/flexiznam/schema/sequencing_data.py
index 0e6f972..ca453b4 100644
--- a/flexiznam/schema/sequencing_data.py
+++ b/flexiznam/schema/sequencing_data.py
@@ -1,7 +1,5 @@
 import datetime
-import os
 import pathlib
-import re
 import warnings
 
 from flexiznam.schema.datasets import Dataset
diff --git a/flexiznam/schema/visstim_data.py b/flexiznam/schema/visstim_data.py
index b3a179f..fe0ff8e 100644
--- a/flexiznam/schema/visstim_data.py
+++ b/flexiznam/schema/visstim_data.py
@@ -1,7 +1,5 @@
 import datetime
-import os
 import pathlib
-import re
 
 from flexiznam.schema.datasets import Dataset
 
diff --git a/tests/test_2p.py b/tests/test_2p.py
index bcf476c..01dbd1d 100644
--- a/tests/test_2p.py
+++ b/tests/test_2p.py
@@ -20,7 +20,6 @@
     TEST_PROJECT,
 )
 import flexiznam as fzn
-from flexiznam import camp
 
 MOUSE = "mouse_physio_2p"
 SESSION = "S20211102"
diff --git a/tests/test_barseq.py b/tests/test_barseq.py
index 7863fca..fd34425 100644
--- a/tests/test_barseq.py
+++ b/tests/test_barseq.py
@@ -17,7 +17,6 @@
     TEST_PROJECT,
 )
 import flexiznam as fzn
-from flexiznam import camp
 
 MOUSE = "mouse_barseq"
 YAML = "yaml_automatic_skeleton.yml"
diff --git a/tests/test_components/test_cli.py b/tests/test_components/test_cli.py
index 2dc0ccb..a1e5f36 100644
--- a/tests/test_components/test_cli.py
+++ b/tests/test_components/test_cli.py
@@ -1,4 +1,3 @@
-import pytest
 import pathlib
 import yaml
 from click.testing import CliRunner
diff --git a/tests/test_components/test_main.py b/tests/test_components/test_main.py
index 252a588..949049e 100644
--- a/tests/test_components/test_main.py
+++ b/tests/test_components/test_main.py
@@ -6,7 +6,7 @@
 import pytest
 import flexiznam as flz
 import yaml
-from flexiznam.config import PARAMETERS, get_password
+from flexiznam.config import PARAMETERS
 from flexiznam.errors import FlexilimsError, NameNotUniqueError
 from tests.tests_resources.data_for_testing import MOUSE_ID, SESSION
 
diff --git a/tests/test_components/test_utils.py b/tests/test_components/test_utils.py
index 9384073..1a37254 100644
--- a/tests/test_components/test_utils.py
+++ b/tests/test_components/test_utils.py
@@ -2,7 +2,6 @@
 import pytest
 import numpy as np
 from pathlib import Path
-import pandas as pd
 import tempfile
 from flexiznam.config import config_tools, DEFAULT_CONFIG
 from flexiznam import utils
diff --git a/tests/test_components/tests_schema/test_camera_data.py b/tests/test_components/tests_schema/test_camera_data.py
index 2c3a3d8..c691996 100644
--- a/tests/test_components/tests_schema/test_camera_data.py
+++ b/tests/test_components/tests_schema/test_camera_data.py
@@ -1,4 +1,3 @@
-import pytest
 from flexiznam.schema.camera_data import CameraData
 from flexiznam.schema.datasets import Dataset
 from tests.tests_resources.data_for_testing import DATA_ROOT, TEST_PROJECT
diff --git a/tests/test_components/tests_schema/test_harp.py b/tests/test_components/tests_schema/test_harp.py
index e31679f..2a0574a 100644
--- a/tests/test_components/tests_schema/test_harp.py
+++ b/tests/test_components/tests_schema/test_harp.py
@@ -1,4 +1,3 @@
-import pytest
 from flexiznam.schema.harp_data import HarpData
 from tests.tests_resources.data_for_testing import DATA_ROOT
 
diff --git a/tests/test_components/tests_schema/test_microscopy_data.py b/tests/test_components/tests_schema/test_microscopy_data.py
index 2eeb05d..4f3a6b1 100644
--- a/tests/test_components/tests_schema/test_microscopy_data.py
+++ b/tests/test_components/tests_schema/test_microscopy_data.py
@@ -1,4 +1,3 @@
-import pytest
 from flexiznam.schema.microscopy_data import MicroscopyData
 from tests.tests_resources.data_for_testing import DATA_ROOT
 
diff --git a/tests/test_components/tests_schema/test_scanimage_data.py b/tests/test_components/tests_schema/test_scanimage_data.py
index 675190a..eb07c24 100644
--- a/tests/test_components/tests_schema/test_scanimage_data.py
+++ b/tests/test_components/tests_schema/test_scanimage_data.py
@@ -1,4 +1,3 @@
-import pytest
 from flexiznam.schema.scanimage_data import ScanimageData
 from tests.tests_resources.data_for_testing import DATA_ROOT
 
diff --git a/tests/test_components/tests_schema/test_sequencing_data.py b/tests/test_components/tests_schema/test_sequencing_data.py
index e066926..d129f59 100644
--- a/tests/test_components/tests_schema/test_sequencing_data.py
+++ b/tests/test_components/tests_schema/test_sequencing_data.py
@@ -1,6 +1,5 @@
-import pytest
 from flexiznam.schema.sequencing_data import SequencingData
-from tests.tests_resources.data_for_testing import DATA_ROOT, PROJECT_ID
+from tests.tests_resources.data_for_testing import DATA_ROOT
 
 # Test creation of all dataset types.
 #
diff --git a/tests/test_components/tests_schema/test_visstim.py b/tests/test_components/tests_schema/test_visstim.py
index f5d6507..97b3f4a 100644
--- a/tests/test_components/tests_schema/test_visstim.py
+++ b/tests/test_components/tests_schema/test_visstim.py
@@ -1,4 +1,3 @@
-import pytest
 from flexiznam.schema.visstim_data import VisStimData
 from tests.tests_resources.data_for_testing import DATA_ROOT
 
diff --git a/tests/tests_resources/data_for_testing.py b/tests/tests_resources/data_for_testing.py
index dd6bb0a..d43a6ce 100644
--- a/tests/tests_resources/data_for_testing.py
+++ b/tests/tests_resources/data_for_testing.py
@@ -1,11 +1,10 @@
 """A list of file coming from one experiment"""
 from pathlib import Path
-import datetime
 from flexiznam.config import PARAMETERS
 
 
 MOUSE_ID = "6437dcb13ded9c65df142a12"  # actual physio2p mouse
-MOUSE_TEMP = "647a1aec7ddb34517470d3e6" # some random mouse where I can change data
+MOUSE_TEMP = "647a1aec7ddb34517470d3e6"  # some random mouse where I can change data
 TEST_PROJECT = "demo_project"
 PROJECT_ID = "610989f9a651ff0b6237e0f6"
 SESSION = "mouse_physio_2p_S20211102"

From bd55ca347bff2d5a6acf0fa3a24de34800d4e95e Mon Sep 17 00:00:00 2001
From: Antonin Blot <antonin.blot@crick.ac.uk>
Date: Thu, 29 Feb 2024 19:41:57 +0000
Subject: [PATCH 48/73] [bugfix] bad dataset name when uploading with skip

---
 flexiznam/schema/datasets.py | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

diff --git a/flexiznam/schema/datasets.py b/flexiznam/schema/datasets.py
index 3f08709..8f4839a 100644
--- a/flexiznam/schema/datasets.py
+++ b/flexiznam/schema/datasets.py
@@ -434,7 +434,7 @@ def update_flexilims(self, mode="safe"):
             path=str(PurePosixPath(self.path)),
             is_raw="yes" if self.is_raw else "no",
             project_id=self.project_id,
-            dataset_name=self.full_name,
+            dataset_name=self.dataset_name,
             attributes=attributes,
             flexilims_session=self.flexilims_session,
             conflicts="abort",

From e0c464674c333e9f902e54ffcdd5244b4456bebb Mon Sep 17 00:00:00 2001
From: Antonin Blot <antonin.blot@crick.ac.uk>
Date: Fri, 1 Mar 2024 16:27:44 +0000
Subject: [PATCH 49/73] Revert "[bugfix] issue in update_flexilims skip"

This reverts commit e2d251ad3e78b6eef783b361770dec60e976dc21.
---
 flexiznam/schema/datasets.py | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

diff --git a/flexiznam/schema/datasets.py b/flexiznam/schema/datasets.py
index 8f4839a..33eb04c 100644
--- a/flexiznam/schema/datasets.py
+++ b/flexiznam/schema/datasets.py
@@ -412,7 +412,7 @@ def update_flexilims(self, mode="safe"):
                 resp = flz.update_entity(
                     datatype="dataset",
                     id=self.id,
-                    name=self.dataset_name,
+                    name=self.full_name,
                     origin_id=self.origin_id,
                     mode=mode,
                     attributes=attributes,

From ad727fb9d7b05dd58c2177de94fba8920c7890a4 Mon Sep 17 00:00:00 2001
From: Antonin Blot <antonin.blot@crick.ac.uk>
Date: Mon, 4 Mar 2024 17:08:13 +0000
Subject: [PATCH 50/73] [minor] option to print where config comes from

---
 flexiznam/config/config_tools.py | 16 ++++++++++++++--
 1 file changed, 14 insertions(+), 2 deletions(-)

diff --git a/flexiznam/config/config_tools.py b/flexiznam/config/config_tools.py
index bf2d373..1234d57 100644
--- a/flexiznam/config/config_tools.py
+++ b/flexiznam/config/config_tools.py
@@ -53,12 +53,24 @@ def _find_file(file_name, config_folder=None, create_if_missing=False):
     raise ConfigurationError("Cannot find %s" % file_name)
 
 
-def load_param(param_folder=None, config_file="config.yml"):
-    """Read parameter file from config folder"""
+def load_param(param_folder=None, config_file="config.yml", verbose=False):
+    """Read parameter file from config folder
+    
+    Args:
+        param_folder (str, optional): folder to look for the file. Defaults to None.
+        config_file (str, optional): name of the file to find. Defaults to "config.yml".
+        verbose (bool, optional): if True, print the path of the file being read. 
+            Defaults to False.
+    
+    Returns:
+        dict: parameters read from the file
+    """
     if param_folder is None:
         param_file = _find_file(config_file)
     else:
         param_file = Path(param_folder) / config_file
+    if verbose:
+        print(f"Reading parameters from {param_file}")
     with open(param_file, "r") as yml_file:
         prm = yaml.safe_load(yml_file)
     return prm

From 53e2d8525d282d70260b396dcd7908235dfe7a4e Mon Sep 17 00:00:00 2001
From: Antonin Blot <antonin.blot@crick.ac.uk>
Date: Mon, 11 Mar 2024 07:45:14 +0000
Subject: [PATCH 51/73] [minor] describe change in changelog

---
 CHANGELOG.md | 1 +
 1 file changed, 1 insertion(+)

diff --git a/CHANGELOG.md b/CHANGELOG.md
index 6611da1..c3cf7c8 100644
--- a/CHANGELOG.md
+++ b/CHANGELOG.md
@@ -16,6 +16,7 @@
 - `get_entities` does not raise warnings anymore if `name` is specified and `datatype` 
 is not. This is now supported upstream by `flexilims`
 - Clearer error message when mouse info cannot be found in MCMS
+- `load_param` can print the file used to read config with the `verbose` flag.
 
 ### Bugfixes
 

From 5b3248a41f0b20e031beb2695108db80aa2e7025 Mon Sep 17 00:00:00 2001
From: Antonin Blot <antonin.blot@crick.ac.uk>
Date: Wed, 1 May 2024 13:19:33 +0100
Subject: [PATCH 52/73] [style] run precommit

---
 .github/workflows/tests.yml            |   2 +-
 .pre-commit-config.yaml                |  28 +++++
 .vscode/settings.json                  |   2 +-
 CHANGELOG.md                           |  52 +++++-----
 README.md                              |   4 +-
 docs/make.bat                          |  70 ++++++-------
 docs/source/conf.py                    | 136 ++++++++++++-------------
 docs/source/flexiznam.camp.rst         |   2 +-
 flexiznam/config/config_tools.py       |   8 +-
 flexiznam/gui/azure.tcl                |   4 +-
 notebooks/01-Setup.ipynb               |  11 +-
 notebooks/02-Add Data.ipynb            |   7 +-
 notebooks/03-Using the database.ipynb  |   7 +-
 requirements.txt                       |  14 +--
 temp.py                                |  14 +++
 tests/ReadMe.md                        |  24 ++---
 tests/test-results/pytest_in_tests.xml |   2 +-
 17 files changed, 209 insertions(+), 178 deletions(-)
 create mode 100644 .pre-commit-config.yaml
 create mode 100644 temp.py

diff --git a/.github/workflows/tests.yml b/.github/workflows/tests.yml
index 7e74ab0..0fc460c 100644
--- a/.github/workflows/tests.yml
+++ b/.github/workflows/tests.yml
@@ -13,7 +13,7 @@ jobs:
   build:
 
     runs-on: ubuntu-latest
-    environment: 
+    environment:
      name: testing
     strategy:
       matrix:
diff --git a/.pre-commit-config.yaml b/.pre-commit-config.yaml
new file mode 100644
index 0000000..b9d55f8
--- /dev/null
+++ b/.pre-commit-config.yaml
@@ -0,0 +1,28 @@
+exclude: 'conf.py'
+
+# Configuring https://pre-commit.ci/
+ci:
+    autoupdate_schedule: monthly
+
+repos:
+    - repo: https://github.com/pre-commit/pre-commit-hooks
+      rev: v4.4.0
+      hooks:
+          - id: check-docstring-first
+          - id: check-executables-have-shebangs
+          - id: check-merge-conflict
+          - id: check-toml
+          - id: end-of-file-fixer
+          - id: mixed-line-ending
+            args: [--fix=lf]
+          - id: requirements-txt-fixer
+          - id: trailing-whitespace
+    - repo: https://github.com/psf/black
+      rev: 23.3.0
+      hooks:
+          - id: black
+    - repo: https://github.com/kynan/nbstripout
+      rev: 0.6.1
+      hooks:
+          - id: nbstripout
+            args: [--extra-keys=metadata.language_info.version metadata.kernelspec.name metadata.kernelspec.display_name]
diff --git a/.vscode/settings.json b/.vscode/settings.json
index 06d557d..1c98995 100644
--- a/.vscode/settings.json
+++ b/.vscode/settings.json
@@ -10,4 +10,4 @@
     ],
     "python.testing.unittestEnabled": false,
     "python.testing.pytestEnabled": true,
-}
\ No newline at end of file
+}
diff --git a/CHANGELOG.md b/CHANGELOG.md
index f3e82cb..d312328 100644
--- a/CHANGELOG.md
+++ b/CHANGELOG.md
@@ -13,7 +13,7 @@
 ### Minor
 - `add_mouse` uploads birth and death dates in a human readable format instead.
 - Add `conflicts` argument to `add_mouse` to overwrite existing mice
-- `get_entities` does not raise warnings anymore if `name` is specified and `datatype` 
+- `get_entities` does not raise warnings anymore if `name` is specified and `datatype`
 is not. This is now supported upstream by `flexilims`
 - Clearer error message when mouse info cannot be found in MCMS
 - `load_param` can print the file used to read config with the `verbose` flag.
@@ -35,13 +35,13 @@ config but not the local config) to the config file
 
 - Add `get_data_root` function to get `raw` or `processed` root for a project
 - `get_children` can filter children by attributes before returning results
-- refactor `get_datasets` to be non recursive and add filtering options. Also add 
+- refactor `get_datasets` to be non recursive and add filtering options. Also add
   multiple options to filter datasets and format output
 - add `get_datasets_recursively` to get all datasets below a given entity
 
 ### Bugfixes
 
-- return empty dataframe if `filter` in `get_children` filters out everything (instead 
+- return empty dataframe if `filter` in `get_children` filters out everything (instead
   of crashing)
 - `update_flexilims` correctly uploads tuples parameters
 - `update_flexilims` correctly uploads floats and np.float/np.int parameters
@@ -52,9 +52,9 @@ config but not the local config) to the config file
 ### Minor
 
 - `harp_dataset.from_folder` will now match csv even if there is nothing before or after
-  `harpmessage` in the file name (i.e. the file is `harpmessage.bin`, and all csvs in 
+  `harpmessage` in the file name (i.e. the file is `harpmessage.bin`, and all csvs in
   the folder will be matched)
-- private function `config_tools._find_files` has now  a `create_if_missing` argument to 
+- private function `config_tools._find_files` has now  a `create_if_missing` argument to
   create the file if it does not exist
 
 ## v0.3.7
@@ -72,24 +72,24 @@ config but not the local config) to the config file
 ## v0.3.5
 
 ### Main changes
-- `flz.get_datasets` can return `Dataset` objects instead of path strings if 
+- `flz.get_datasets` can return `Dataset` objects instead of path strings if
   `return_paths=False`
 - New `OnixData` class to handle Onix data
 - `get_flexilims_session` can now re-use token from a previous session
 - Add a GUI module.
 
 ### Minor
-- More generic `clean_recursively` replaces the `clean_dictionary_recursively`. It 
+- More generic `clean_recursively` replaces the `clean_dictionary_recursively`. It
   handle more complex nesting and replaces non finite float by their string repr.
 - `CameraDataset` metadata can also be `.yml`, not only `.txt`.
-- `Dataset.format(mode='yaml')` ensure yaml compatibility. (path to str, tuple to list, 
+- `Dataset.format(mode='yaml')` ensure yaml compatibility. (path to str, tuple to list,
   etc...)
 - `add_experimental_session` can be done with `parent_id` (or `parent_name`).
 - `add_dataset` can add a dataset to a mouse and does not require genealogy.
 
 
 ### Bugfixes
-- Fix [#68](https://github.com/znamlab/flexiznam/issues/68). Dataset.format returns 
+- Fix [#68](https://github.com/znamlab/flexiznam/issues/68). Dataset.format returns
   always the path in posix format.
 - Fix [#88](https://github.com/znamlab/flexiznam/issues/88). Now make attributes JSON
   compatible before uploading to flexilims. This will replace special characters in
@@ -124,7 +124,7 @@ config but not the local config) to the config file
 
 ### Bugfixes
 - `add_genealogy` now works with scanimage datasets
-- `HarpData` does not match csv if the file name is only `harpmessage.bin`. 
+- `HarpData` does not match csv if the file name is only `harpmessage.bin`.
   See issue #93
 - Adapt `add_mouse` to new MCMS page layout
 - `config --update` adds fields that are new in the default config to the current config
@@ -137,29 +137,29 @@ config but not the local config) to the config file
 ## v0.3.2
 
 ### Main changes
-- Add CLI function: `check_flexilims_issues` to check for ill-named entity and invalid 
+- Add CLI function: `check_flexilims_issues` to check for ill-named entity and invalid
   paths
-- `update_config` now adds all project_ids to the default config (requires to have 
+- `update_config` now adds all project_ids to the default config (requires to have
   flexilims access)
 
 ### Breaking changes:
 - `add_dataset` requires the genealogy argument
-- `from_folder` uses a `folder_genealogy` argument instead of the previous `mouse`, 
+- `from_folder` uses a `folder_genealogy` argument instead of the previous `mouse`,
   `session` and ` recording` arguments
 - `Dataset` creation requires `genealogy` instead of `name`
-- `Dataset` has now a `Dataset.full_name` and `Dataset.short_name` property instead 
-  of a `Dataset.name` 
+- `Dataset` has now a `Dataset.full_name` and `Dataset.short_name` property instead
+  of a `Dataset.name`
 
 ### Main changes
-- `from_origin` has a new `base_name` property to allow multiple datasets of the same 
+- `from_origin` has a new `base_name` property to allow multiple datasets of the same
   `dataset_type` below the same origin.
 
 ### Minor
-- `add_mouse` can be given a dictionary of info instead of reading them from MCMS (to 
+- `add_mouse` can be given a dictionary of info instead of reading them from MCMS (to
   allow for manual download)
-- `add_experimental_session` uses parent path as base path. It means that parent must 
+- `add_experimental_session` uses parent path as base path. It means that parent must
   have a path
-- `CameraData.from_folder` has an option to detect partial datasets (i.e. without 
+- `CameraData.from_folder` has an option to detect partial datasets (i.e. without
   timestamps or metadata)
 - Reduce default verbosity of some functions
 - `get_flexilims_sessions` can get a session without setting the project_id
@@ -171,21 +171,21 @@ config but not the local config) to the config file
 
 ### Main changes
 - Compatible with flexilims v0.2. `None` and `''` can both be uploaded.
-- Dataset.is_raw can be autodetermined from path. If this fails, it **must** be 
+- Dataset.is_raw can be autodetermined from path. If this fails, it **must** be
   manually set.
-- New function and CLI entry: `create_yaml` to create the skeleton of a yaml before 
+- New function and CLI entry: `create_yaml` to create the skeleton of a yaml before
   parsing.
 - Extensions for microscopy datasets are now defined in the config file.
 - ScanImage datasets have a `stack_type` attribute, default to `calcium`.
 - Authorise `overwrite` when adding samples, sessions, recordings, or datasets.
 - Add  `flz.utils.check_flexilims_path` to verify that defined paths actually exist.
-- Add `flz.utils.check_flexilims_names` to verify that entity names start with their 
+- Add `flz.utils.check_flexilims_names` to verify that entity names start with their
   parent's name.
-- Add `flz.utils.add_genealogy` to add a `genealogy` field to flexilims entries. This 
-  field contains the list of parents ([mouse, session, recording] for instance) up to 
+- Add `flz.utils.add_genealogy` to add a `genealogy` field to flexilims entries. This
+  field contains the list of parents ([mouse, session, recording] for instance) up to
   the short name of the current entity
-- Add `flz.utilis.add_missing_paths` to update flexilims to add `path` attribute to 
-  non-dataset entities that have a genealogy defined. The path is set to `project / 
+- Add `flz.utilis.add_missing_paths` to update flexilims to add `path` attribute to
+  non-dataset entities that have a genealogy defined. The path is set to `project /
   Path(*genealogy)` if this folder exists in the processed or raw root directory.
 
 ### Bugfixes:
diff --git a/README.md b/README.md
index 40a70f1..a8f9121 100644
--- a/README.md
+++ b/README.md
@@ -38,7 +38,7 @@ To set up the flexilims and mcms integration, the config file must be edited. Fi
 flexiznam config
 ```
 
-This should create a `~/.flexiznam/config.yml` file. Edit it with your favorite text editor to change `flexilims_username`, `mcms_username` and, 
+This should create a `~/.flexiznam/config.yml` file. Edit it with your favorite text editor to change `flexilims_username`, `mcms_username` and,
 if neeed `data_root`.
 
 You can then add passwords to make it simpler by running (one by one):
@@ -78,7 +78,7 @@ If you used `pip -e .` to install, updating can be done with:
 
 ```
 cd flexiznam
-git pull 
+git pull
 pip install -e . --upgrade
 flexiznam config --update
 ```
diff --git a/docs/make.bat b/docs/make.bat
index 6247f7e..9534b01 100644
--- a/docs/make.bat
+++ b/docs/make.bat
@@ -1,35 +1,35 @@
-@ECHO OFF
-
-pushd %~dp0
-
-REM Command file for Sphinx documentation
-
-if "%SPHINXBUILD%" == "" (
-	set SPHINXBUILD=sphinx-build
-)
-set SOURCEDIR=source
-set BUILDDIR=build
-
-if "%1" == "" goto help
-
-%SPHINXBUILD% >NUL 2>NUL
-if errorlevel 9009 (
-	echo.
-	echo.The 'sphinx-build' command was not found. Make sure you have Sphinx
-	echo.installed, then set the SPHINXBUILD environment variable to point
-	echo.to the full path of the 'sphinx-build' executable. Alternatively you
-	echo.may add the Sphinx directory to PATH.
-	echo.
-	echo.If you don't have Sphinx installed, grab it from
-	echo.http://sphinx-doc.org/
-	exit /b 1
-)
-
-%SPHINXBUILD% -M %1 %SOURCEDIR% %BUILDDIR% %SPHINXOPTS% %O%
-goto end
-
-:help
-%SPHINXBUILD% -M help %SOURCEDIR% %BUILDDIR% %SPHINXOPTS% %O%
-
-:end
-popd
+@ECHO OFF
+
+pushd %~dp0
+
+REM Command file for Sphinx documentation
+
+if "%SPHINXBUILD%" == "" (
+	set SPHINXBUILD=sphinx-build
+)
+set SOURCEDIR=source
+set BUILDDIR=build
+
+if "%1" == "" goto help
+
+%SPHINXBUILD% >NUL 2>NUL
+if errorlevel 9009 (
+	echo.
+	echo.The 'sphinx-build' command was not found. Make sure you have Sphinx
+	echo.installed, then set the SPHINXBUILD environment variable to point
+	echo.to the full path of the 'sphinx-build' executable. Alternatively you
+	echo.may add the Sphinx directory to PATH.
+	echo.
+	echo.If you don't have Sphinx installed, grab it from
+	echo.http://sphinx-doc.org/
+	exit /b 1
+)
+
+%SPHINXBUILD% -M %1 %SOURCEDIR% %BUILDDIR% %SPHINXOPTS% %O%
+goto end
+
+:help
+%SPHINXBUILD% -M help %SOURCEDIR% %BUILDDIR% %SPHINXOPTS% %O%
+
+:end
+popd
diff --git a/docs/source/conf.py b/docs/source/conf.py
index 3295c09..82d2a42 100644
--- a/docs/source/conf.py
+++ b/docs/source/conf.py
@@ -1,68 +1,68 @@
-# Configuration file for the Sphinx documentation builder.
-#
-# This file only contains a selection of the most common options. For a full
-# list see the documentation:
-# https://www.sphinx-doc.org/en/master/usage/configuration.html
-
-# -- Path setup --------------------------------------------------------------
-
-# If extensions (or modules to document with autodoc) are in another directory,
-# add these directories to sys.path here. If the directory is relative to the
-# documentation root, use os.path.abspath to make it absolute, like shown here.
-#
-# import os
-# import sys
-# sys.path.insert(0, os.path.abspath('.'))
-
-import os
-import sys
-
-sys.path.insert(0, os.path.abspath("../.."))
-
-# -- Project information -----------------------------------------------------
-
-project = "flexiznam"
-copyright = "2021, Antonin Blot, Petr Znamenskiy"
-author = "Antonin Blot, Petr Znamenskiy"
-
-
-# -- General configuration ---------------------------------------------------
-
-# Add any Sphinx extension module names here, as strings. They can be
-# extensions coming with Sphinx (named 'sphinx.ext.*') or your custom
-# ones.
-extensions = [
-    "sphinx.ext.autodoc",
-    "sphinx.ext.autosectionlabel",
-    "sphinx.ext.intersphinx",
-    "sphinx.ext.napoleon",
-    "sphinx.ext.viewcode",
-    "sphinx_click",
-]
-
-intersphinx_mapping = {
-    "python": ("https://docs.python.org/3", None),
-    "pandas": ("https://pandas.pydata.org/docs/", None),
-}
-
-
-# Add any paths that contain templates here, relative to this directory.
-templates_path = ["_templates"]
-
-# List of patterns, relative to source directory, that match files and
-# directories to ignore when looking for source files.
-# This pattern also affects html_static_path and html_extra_path.
-exclude_patterns = []
-
-
-# -- Options for HTML output -------------------------------------------------
-
-# The theme to use for HTML and HTML Help pages.  See the documentation for
-# a list of builtin themes.
-#
-html_theme = "sphinx_rtd_theme"
-
-# Add any paths that contain custom static files (such as style sheets) here,
-# relative to this directory. They are copied after the builtin static files,
-# so a file named "default.css" will overwrite the builtin "default.css".
-html_static_path = ["_static"]
+# Configuration file for the Sphinx documentation builder.
+#
+# This file only contains a selection of the most common options. For a full
+# list see the documentation:
+# https://www.sphinx-doc.org/en/master/usage/configuration.html
+
+# -- Path setup --------------------------------------------------------------
+
+# If extensions (or modules to document with autodoc) are in another directory,
+# add these directories to sys.path here. If the directory is relative to the
+# documentation root, use os.path.abspath to make it absolute, like shown here.
+#
+# import os
+# import sys
+# sys.path.insert(0, os.path.abspath('.'))
+
+import os
+import sys
+
+sys.path.insert(0, os.path.abspath("../.."))
+
+# -- Project information -----------------------------------------------------
+
+project = "flexiznam"
+copyright = "2021, Antonin Blot, Petr Znamenskiy"
+author = "Antonin Blot, Petr Znamenskiy"
+
+
+# -- General configuration ---------------------------------------------------
+
+# Add any Sphinx extension module names here, as strings. They can be
+# extensions coming with Sphinx (named 'sphinx.ext.*') or your custom
+# ones.
+extensions = [
+    "sphinx.ext.autodoc",
+    "sphinx.ext.autosectionlabel",
+    "sphinx.ext.intersphinx",
+    "sphinx.ext.napoleon",
+    "sphinx.ext.viewcode",
+    "sphinx_click",
+]
+
+intersphinx_mapping = {
+    "python": ("https://docs.python.org/3", None),
+    "pandas": ("https://pandas.pydata.org/docs/", None),
+}
+
+
+# Add any paths that contain templates here, relative to this directory.
+templates_path = ["_templates"]
+
+# List of patterns, relative to source directory, that match files and
+# directories to ignore when looking for source files.
+# This pattern also affects html_static_path and html_extra_path.
+exclude_patterns = []
+
+
+# -- Options for HTML output -------------------------------------------------
+
+# The theme to use for HTML and HTML Help pages.  See the documentation for
+# a list of builtin themes.
+#
+html_theme = "sphinx_rtd_theme"
+
+# Add any paths that contain custom static files (such as style sheets) here,
+# relative to this directory. They are copied after the builtin static files,
+# so a file named "default.css" will overwrite the builtin "default.css".
+html_static_path = ["_static"]
diff --git a/docs/source/flexiznam.camp.rst b/docs/source/flexiznam.camp.rst
index 3acc855..1cf4650 100644
--- a/docs/source/flexiznam.camp.rst
+++ b/docs/source/flexiznam.camp.rst
@@ -8,7 +8,7 @@ Module contents
    :members:
    :undoc-members:
    :show-inheritance:
-   
+
 flexiznam.camp.sync\_data module
 --------------------------------
 
diff --git a/flexiznam/config/config_tools.py b/flexiznam/config/config_tools.py
index 1234d57..660a796 100644
--- a/flexiznam/config/config_tools.py
+++ b/flexiznam/config/config_tools.py
@@ -55,13 +55,13 @@ def _find_file(file_name, config_folder=None, create_if_missing=False):
 
 def load_param(param_folder=None, config_file="config.yml", verbose=False):
     """Read parameter file from config folder
-    
+
     Args:
         param_folder (str, optional): folder to look for the file. Defaults to None.
         config_file (str, optional): name of the file to find. Defaults to "config.yml".
-        verbose (bool, optional): if True, print the path of the file being read. 
+        verbose (bool, optional): if True, print the path of the file being read.
             Defaults to False.
-    
+
     Returns:
         dict: parameters read from the file
     """
@@ -167,7 +167,7 @@ def update_config(
             project_ids.update(kwargs["project_ids"])
         kwargs["project_ids"] = project_ids
         all_ids = {}
-        for (pname, pid) in kwargs["project_ids"].items():
+        for pname, pid in kwargs["project_ids"].items():
             if pid in all_ids:
                 warnings.warn(f"PIDs {pname} and {all_ids[pid]} have the same ID")
             all_ids[pid] = pname
diff --git a/flexiznam/gui/azure.tcl b/flexiznam/gui/azure.tcl
index 3e75502..fead545 100644
--- a/flexiznam/gui/azure.tcl
+++ b/flexiznam/gui/azure.tcl
@@ -17,7 +17,7 @@ proc set_theme {mode} {
             -selectfg       "#ffffff"
             -selectbg       "#007fff"
         }
-        
+
         ttk::style configure . \
             -background $colors(-bg) \
             -foreground $colors(-fg) \
@@ -44,7 +44,7 @@ proc set_theme {mode} {
 
         option add *font [ttk::style lookup . -font]
         option add *Menu.selectcolor $colors(-fg)
-    
+
 	} elseif {$mode == "light"} {
 		ttk::style theme use "azure-light"
 
diff --git a/notebooks/01-Setup.ipynb b/notebooks/01-Setup.ipynb
index ad0fdc0..f749f7d 100644
--- a/notebooks/01-Setup.ipynb
+++ b/notebooks/01-Setup.ipynb
@@ -139,9 +139,7 @@
   {
    "cell_type": "code",
    "execution_count": null,
-   "metadata": {
-    "scrolled": true
-   },
+   "metadata": {},
    "outputs": [],
    "source": [
     "!cat ~/.flexiznam/secret_password.yml"
@@ -172,9 +170,7 @@
  ],
  "metadata": {
   "kernelspec": {
-   "display_name": "Python 3",
-   "language": "python",
-   "name": "python3"
+   "language": "python"
   },
   "language_info": {
    "codemirror_mode": {
@@ -185,8 +181,7 @@
    "mimetype": "text/x-python",
    "name": "python",
    "nbconvert_exporter": "python",
-   "pygments_lexer": "ipython3",
-   "version": "3.8.5"
+   "pygments_lexer": "ipython3"
   },
   "toc": {
    "base_numbering": 1,
diff --git a/notebooks/02-Add Data.ipynb b/notebooks/02-Add Data.ipynb
index 2b8524f..3b9e85d 100644
--- a/notebooks/02-Add Data.ipynb	
+++ b/notebooks/02-Add Data.ipynb	
@@ -241,9 +241,7 @@
  ],
  "metadata": {
   "kernelspec": {
-   "display_name": "Python 3",
-   "language": "python",
-   "name": "python3"
+   "language": "python"
   },
   "language_info": {
    "codemirror_mode": {
@@ -254,8 +252,7 @@
    "mimetype": "text/x-python",
    "name": "python",
    "nbconvert_exporter": "python",
-   "pygments_lexer": "ipython3",
-   "version": "3.8.5"
+   "pygments_lexer": "ipython3"
   },
   "toc": {
    "base_numbering": 1,
diff --git a/notebooks/03-Using the database.ipynb b/notebooks/03-Using the database.ipynb
index d839c14..ad48e98 100644
--- a/notebooks/03-Using the database.ipynb	
+++ b/notebooks/03-Using the database.ipynb	
@@ -290,9 +290,7 @@
  ],
  "metadata": {
   "kernelspec": {
-   "display_name": "Python 3",
-   "language": "python",
-   "name": "python3"
+   "language": "python"
   },
   "language_info": {
    "codemirror_mode": {
@@ -303,8 +301,7 @@
    "mimetype": "text/x-python",
    "name": "python",
    "nbconvert_exporter": "python",
-   "pygments_lexer": "ipython3",
-   "version": "3.8.5"
+   "pygments_lexer": "ipython3"
   },
   "toc": {
    "base_numbering": 1,
diff --git a/requirements.txt b/requirements.txt
index 21b31d0..ad16b0d 100644
--- a/requirements.txt
+++ b/requirements.txt
@@ -1,12 +1,12 @@
-pytest
-setuptools
-pandas
-webbot
+black
 click
 git+ssh://git@github.com/znamlab/flexilims.git#egg=flexilims
+pandas
+pytest
 pyyaml
-tifffile
+setuptools
 sphinx
-sphinx-rtd-theme
 sphinx-click
-black
+sphinx-rtd-theme
+tifffile
+webbot
diff --git a/temp.py b/temp.py
new file mode 100644
index 0000000..872bbce
--- /dev/null
+++ b/temp.py
@@ -0,0 +1,14 @@
+from flexiznam.config import config_tools
+
+ymlfile = "/camp/home/blota/home/users/blota/temp/s20230605_valid.yml"
+from flexiznam.camp import sync_data as sd
+
+o = sd.parse_yaml(ymlfile)
+
+
+config_folder = None
+fname = config_tools._find_file("config.yml", config_folder=config_folder)
+prm = config_tools.load_param(param_folder=config_folder)
+config_tools.update_config(
+    param_file="config.yml", config_folder=config_folder, add_all_projects=True, **prm
+)
diff --git a/tests/ReadMe.md b/tests/ReadMe.md
index edaf613..10036bd 100644
--- a/tests/ReadMe.md
+++ b/tests/ReadMe.md
@@ -4,37 +4,37 @@
 
 Tests are separated in two:
 
-- Main use cases found in the main test folder 
+- Main use cases found in the main test folder
 - Test of individual components found in `test_components`
- 
-The `test_components` should cover most of the code but are not user friendly. The 
+
+The `test_components` should cover most of the code but are not user friendly. The
 main use cases are example scripts that could be use for a real experiment.
 
 ## Data
 
-Example datasets are available in the 
+Example datasets are available in the
 raw data folder on camp `data/instruments/raw_data/projects/demo_project/`.
 A corresponding preprocessed folder is also used by tests.
 
 ## Notes:
 
 ### MCMS
-To test the MCMS part, you need a graphical interface and a browser. It is also 
+To test the MCMS part, you need a graphical interface and a browser. It is also
 particularly slow.
 
-To avoid having to run it every time, the tests are marked as slow and require the 
+To avoid having to run it every time, the tests are marked as slow and require the
 `--runslow` flag to be executed. This is False by default
 
 ### Flexilims
-For interaction with flexilims, you need to be connected via the crick network 
-(vpn or from the crick). Neither is easily doable on github workflow. Furthermore 
-flexilims does not have an API to delete entries. You will have clean it manually 
+For interaction with flexilims, you need to be connected via the crick network
+(vpn or from the crick). Neither is easily doable on github workflow. Furthermore
+flexilims does not have an API to delete entries. You will have clean it manually
 before running the tests
 
-To make things simpler, the tests requiring flexilims or mcms are marked as integration 
+To make things simpler, the tests requiring flexilims or mcms are marked as integration
 tests. They can be skipped by running `pytest -m "not integtest"`.
 
-To test the upload to flexilims properly, you need to clear flexilims yourself 
-(as there is no API to delete stuff). There should be a flag `FLM_IS_WIPED` at 
+To test the upload to flexilims properly, you need to clear flexilims yourself
+(as there is no API to delete stuff). There should be a flag `FLM_IS_WIPED` at
 the beginning of each test file. If set to `False` (default), then tests involving
 flexilims will run with `conflicts=skip`.
diff --git a/tests/test-results/pytest_in_tests.xml b/tests/test-results/pytest_in_tests.xml
index e3838f1..d3c28e6 100644
--- a/tests/test-results/pytest_in_tests.xml
+++ b/tests/test-results/pytest_in_tests.xml
@@ -1 +1 @@
-<?xml version="1.0" encoding="utf-8"?><testsuites><testsuite name="pytest" errors="0" failures="0" skipped="0" tests="54" time="678.628" timestamp="2023-07-17T23:11:47.609985" hostname="FM70R4W0XG"><testcase classname="tests.test_2p" name="test_create_yaml" time="9.791" /><testcase classname="tests.test_2p" name="test_parse_yaml" time="4.791" /><testcase classname="tests.test_2p" name="test_flm" time="21.164" /><testcase classname="tests.test_barseq" name="test_create_yaml" time="2.402" /><testcase classname="tests.test_barseq" name="test_parse_yaml" time="11.728" /><testcase classname="tests.test_barseq" name="test_flm" time="38.502" /><testcase classname="tests.test_components.test_cli" name="test_config" time="0.264" /><testcase classname="tests.test_components.test_cli" name="test_add_password" time="0.028" /><testcase classname="tests.test_components.test_cli" name="test_create_yaml" time="0.801" /><testcase classname="tests.test_components.test_cli" name="test_make_full_yaml" time="10.921" /><testcase classname="tests.test_components.test_cli" name="test_upload" time="28.840" /><testcase classname="tests.test_components.test_cli" name="test_flm_issues" time="64.297" /><testcase classname="tests.test_components.test_main" name="test_get_path" time="0.212" /><testcase classname="tests.test_components.test_main" name="test_get_flexilims_session" time="0.276" /><testcase classname="tests.test_components.test_main" name="test_format_results" time="0.021" /><testcase classname="tests.test_components.test_main" name="test_get_experimental_sessions" time="0.709" /><testcase classname="tests.test_components.test_main" name="test_get_entities" time="1.006" /><testcase classname="tests.test_components.test_main" name="test_get_entity" time="0.073" /><testcase classname="tests.test_components.test_main" name="test_get_mouse_id" time="0.315" /><testcase classname="tests.test_components.test_main" name="test_get_datasets" time="11.690" /><testcase 
classname="tests.test_components.test_main" name="test_get_datasets_recursively" time="17.020" /><testcase classname="tests.test_components.test_main" name="test_add_mouse" time="4.284" /><testcase classname="tests.test_components.test_main" name="test_generate_name" time="3.180" /><testcase classname="tests.test_components.test_main" name="test_get_children" time="5.837" /><testcase classname="tests.test_components.test_main" name="test_add_entity" time="1.935" /><testcase classname="tests.test_components.test_main" name="test_update_entity" time="5.820" /><testcase classname="tests.test_components.test_mcms" name="test_get_mouse_df" time="0.676" /><testcase classname="tests.test_components.test_mcms" name="test_get_procedures" time="0.558" /><testcase classname="tests.test_components.test_utils" name="test_create_config" time="0.060" /><testcase classname="tests.test_components.test_utils" name="test_update_config" time="0.364" /><testcase classname="tests.test_components.test_utils" name="test_passwd_creation" time="0.063" /><testcase classname="tests.test_components.test_utils" name="test_check_flexilims_paths" time="72.021" /><testcase classname="tests.test_components.test_utils" name="test_check_flexilims_names" time="70.026" /><testcase classname="tests.test_components.test_utils" name="test_add_genealogy" time="149.076" /><testcase classname="tests.test_components.test_utils" name="test_clean_recursively" time="0.010" /><testcase classname="tests.test_components.test_utils" name="test_add_missing_paths" time="31.141" /><testcase classname="tests.test_components.test_utils" name="test_check_attribute" time="67.307" /><testcase classname="tests.test_components.tests_schema.test_camera_data" name="test_create_directly" time="0.009" /><testcase classname="tests.test_components.tests_schema.test_camera_data" name="test_create_from_folder" time="1.188" /><testcase classname="tests.test_components.tests_schema.test_camera_data" name="test_create_from_flexilims" 
time="1.107" /><testcase classname="tests.test_components.tests_schema.test_datasets" name="test_dataset" time="0.016" /><testcase classname="tests.test_components.tests_schema.test_datasets" name="test_constructor" time="0.021" /><testcase classname="tests.test_components.tests_schema.test_datasets" name="test_dataset_flexilims_integration" time="0.900" /><testcase classname="tests.test_components.tests_schema.test_datasets" name="test_from_flexilims" time="1.103" /><testcase classname="tests.test_components.tests_schema.test_datasets" name="test_from_dataseries" time="0.018" /><testcase classname="tests.test_components.tests_schema.test_datasets" name="test_from_origin" time="27.100" /><testcase classname="tests.test_components.tests_schema.test_datasets" name="test_update_flexilims" time="7.381" /><testcase classname="tests.test_components.tests_schema.test_datasets" name="test_dataset_paths" time="0.245" /><testcase classname="tests.test_components.tests_schema.test_datasets" name="test_project_project_id" time="0.013" /><testcase classname="tests.test_components.tests_schema.test_datasets" name="test_dataset_type_enforcer" time="0.011" /><testcase classname="tests.test_components.tests_schema.test_harp" name="test_harp" time="0.762" /><testcase classname="tests.test_components.tests_schema.test_microscopy_data" name="test_from_folder" time="0.107" /><testcase classname="tests.test_components.tests_schema.test_scanimage_data" name="test_scanimage" time="0.565" /><testcase classname="tests.test_components.tests_schema.test_sequencing_data" name="test_from_folder" time="0.628" /></testsuite></testsuites>
\ No newline at end of file
+<?xml version="1.0" encoding="utf-8"?><testsuites><testsuite name="pytest" errors="0" failures="0" skipped="0" tests="54" time="678.628" timestamp="2023-07-17T23:11:47.609985" hostname="FM70R4W0XG"><testcase classname="tests.test_2p" name="test_create_yaml" time="9.791" /><testcase classname="tests.test_2p" name="test_parse_yaml" time="4.791" /><testcase classname="tests.test_2p" name="test_flm" time="21.164" /><testcase classname="tests.test_barseq" name="test_create_yaml" time="2.402" /><testcase classname="tests.test_barseq" name="test_parse_yaml" time="11.728" /><testcase classname="tests.test_barseq" name="test_flm" time="38.502" /><testcase classname="tests.test_components.test_cli" name="test_config" time="0.264" /><testcase classname="tests.test_components.test_cli" name="test_add_password" time="0.028" /><testcase classname="tests.test_components.test_cli" name="test_create_yaml" time="0.801" /><testcase classname="tests.test_components.test_cli" name="test_make_full_yaml" time="10.921" /><testcase classname="tests.test_components.test_cli" name="test_upload" time="28.840" /><testcase classname="tests.test_components.test_cli" name="test_flm_issues" time="64.297" /><testcase classname="tests.test_components.test_main" name="test_get_path" time="0.212" /><testcase classname="tests.test_components.test_main" name="test_get_flexilims_session" time="0.276" /><testcase classname="tests.test_components.test_main" name="test_format_results" time="0.021" /><testcase classname="tests.test_components.test_main" name="test_get_experimental_sessions" time="0.709" /><testcase classname="tests.test_components.test_main" name="test_get_entities" time="1.006" /><testcase classname="tests.test_components.test_main" name="test_get_entity" time="0.073" /><testcase classname="tests.test_components.test_main" name="test_get_mouse_id" time="0.315" /><testcase classname="tests.test_components.test_main" name="test_get_datasets" time="11.690" /><testcase 
classname="tests.test_components.test_main" name="test_get_datasets_recursively" time="17.020" /><testcase classname="tests.test_components.test_main" name="test_add_mouse" time="4.284" /><testcase classname="tests.test_components.test_main" name="test_generate_name" time="3.180" /><testcase classname="tests.test_components.test_main" name="test_get_children" time="5.837" /><testcase classname="tests.test_components.test_main" name="test_add_entity" time="1.935" /><testcase classname="tests.test_components.test_main" name="test_update_entity" time="5.820" /><testcase classname="tests.test_components.test_mcms" name="test_get_mouse_df" time="0.676" /><testcase classname="tests.test_components.test_mcms" name="test_get_procedures" time="0.558" /><testcase classname="tests.test_components.test_utils" name="test_create_config" time="0.060" /><testcase classname="tests.test_components.test_utils" name="test_update_config" time="0.364" /><testcase classname="tests.test_components.test_utils" name="test_passwd_creation" time="0.063" /><testcase classname="tests.test_components.test_utils" name="test_check_flexilims_paths" time="72.021" /><testcase classname="tests.test_components.test_utils" name="test_check_flexilims_names" time="70.026" /><testcase classname="tests.test_components.test_utils" name="test_add_genealogy" time="149.076" /><testcase classname="tests.test_components.test_utils" name="test_clean_recursively" time="0.010" /><testcase classname="tests.test_components.test_utils" name="test_add_missing_paths" time="31.141" /><testcase classname="tests.test_components.test_utils" name="test_check_attribute" time="67.307" /><testcase classname="tests.test_components.tests_schema.test_camera_data" name="test_create_directly" time="0.009" /><testcase classname="tests.test_components.tests_schema.test_camera_data" name="test_create_from_folder" time="1.188" /><testcase classname="tests.test_components.tests_schema.test_camera_data" name="test_create_from_flexilims" 
time="1.107" /><testcase classname="tests.test_components.tests_schema.test_datasets" name="test_dataset" time="0.016" /><testcase classname="tests.test_components.tests_schema.test_datasets" name="test_constructor" time="0.021" /><testcase classname="tests.test_components.tests_schema.test_datasets" name="test_dataset_flexilims_integration" time="0.900" /><testcase classname="tests.test_components.tests_schema.test_datasets" name="test_from_flexilims" time="1.103" /><testcase classname="tests.test_components.tests_schema.test_datasets" name="test_from_dataseries" time="0.018" /><testcase classname="tests.test_components.tests_schema.test_datasets" name="test_from_origin" time="27.100" /><testcase classname="tests.test_components.tests_schema.test_datasets" name="test_update_flexilims" time="7.381" /><testcase classname="tests.test_components.tests_schema.test_datasets" name="test_dataset_paths" time="0.245" /><testcase classname="tests.test_components.tests_schema.test_datasets" name="test_project_project_id" time="0.013" /><testcase classname="tests.test_components.tests_schema.test_datasets" name="test_dataset_type_enforcer" time="0.011" /><testcase classname="tests.test_components.tests_schema.test_harp" name="test_harp" time="0.762" /><testcase classname="tests.test_components.tests_schema.test_microscopy_data" name="test_from_folder" time="0.107" /><testcase classname="tests.test_components.tests_schema.test_scanimage_data" name="test_scanimage" time="0.565" /><testcase classname="tests.test_components.tests_schema.test_sequencing_data" name="test_from_folder" time="0.628" /></testsuite></testsuites>

From d8056039ee3e3e90c76d8e8eb6659d18d3241920 Mon Sep 17 00:00:00 2001
From: Antonin Blot <antonin.blot@crick.ac.uk>
Date: Tue, 19 Mar 2024 16:06:42 +0000
Subject: [PATCH 53/73] [feature] crash `update_entity` if reserved fields are
 used as attributes.

---
 CHANGELOG.md                       |  7 +++++++
 flexiznam/main.py                  |  6 +++++-
 tests/test_components/test_main.py | 12 ++++++++++++
 3 files changed, 24 insertions(+), 1 deletion(-)

diff --git a/CHANGELOG.md b/CHANGELOG.md
index d312328..50807b4 100644
--- a/CHANGELOG.md
+++ b/CHANGELOG.md
@@ -23,6 +23,13 @@ is not. This is now supported upstream by `flexilims`
 - `update_config` actually adds the new fields (i.e. fields that are in the default
 config but not the local config) to the config file
 
+
+## v0.3.10
+
+### Main changes
+
+- Make `update_entity` safer by crashing if reserved fields are used as attributes.
+
 ## v0.3.9
 
 ### Main changes
diff --git a/flexiznam/main.py b/flexiznam/main.py
index 6ee819f..6bf4c41 100755
--- a/flexiznam/main.py
+++ b/flexiznam/main.py
@@ -772,7 +772,11 @@ def update_entity(
         raise AttributeError("`mode` must be `overwrite` or `update`")
     if id is None:
         id = entity["id"]
-
+    for attr in full_attributes:
+        if attr in entity:
+            raise FlexilimsError(
+                "Attribute `%s` is a flexilims reserved keyword" % attr
+            )
     rep = flexilims_session.update_one(
         id=id,
         datatype=datatype,
diff --git a/tests/test_components/test_main.py b/tests/test_components/test_main.py
index 949049e..bbd4927 100644
--- a/tests/test_components/test_main.py
+++ b/tests/test_components/test_main.py
@@ -429,3 +429,15 @@ def test_update_entity(flm_sess):
         datatype="dataset", name=dataset_name, flexilims_session=flm_sess
     )
     assert repr(new_entity) == repr(original_entity)
+    with pytest.raises(FlexilimsError) as err:
+        flz.update_entity(
+            "dataset",
+            name=dataset_name,
+            flexilims_session=flm_sess,
+            attributes={
+                "path": "new/path",
+                "dataset_type": "scanimage",
+                "project": "random",
+                "createdBy": "BAD",
+            },
+        )

From 7efd79fed5f627666bdc022d6dad1be41027a091 Mon Sep 17 00:00:00 2001
From: Antonin Blot <antonin.blot@crick.ac.uk>
Date: Wed, 1 May 2024 13:46:28 +0100
Subject: [PATCH 54/73] [bugfix] add_mouse works with alive mice

---
 flexiznam/main.py | 13 +++++++------
 1 file changed, 7 insertions(+), 6 deletions(-)

diff --git a/flexiznam/main.py b/flexiznam/main.py
index 6bf4c41..da6576f 100755
--- a/flexiznam/main.py
+++ b/flexiznam/main.py
@@ -217,12 +217,13 @@ def add_mouse(
         # format birthdate
         for date_type in ["birth_date", "death_date"]:
             d = mcms_info[date_type]
-            d = datetime.datetime.fromisoformat(d)
-            # birthdate is at midnight or 23 depending on the time zone
-            if d.hour <= 12:
-                date = d.strftime("%Y-%m-%d")
-            else:
-                date = (d + datetime.timedelta(days=1)).strftime("%Y-%m-%d")
+            if d is not None:
+                d = datetime.datetime.fromisoformat(d)
+                # birthdate is at midnight or 23 depending on the time zone
+                if d.hour <= 12:
+                    date = d.strftime("%Y-%m-%d")
+                else:
+                    date = (d + datetime.timedelta(days=1)).strftime("%Y-%m-%d")
             mcms_info[date_type] = date
         # update mouse_info with mcms_info but prioritise mouse_info for conflicts
         mouse_info = dict(mcms_info, **mouse_info)

From 0ecacf40713a05639dae0a32253355b293df62f6 Mon Sep 17 00:00:00 2001
From: Antonin Blot <antonin.blot@crick.ac.uk>
Date: Wed, 1 May 2024 13:48:11 +0100
Subject: [PATCH 55/73] [v0.3.11]

---
 CHANGELOG.md | 24 ++++++++++++++++++++++++
 1 file changed, 24 insertions(+)

diff --git a/CHANGELOG.md b/CHANGELOG.md
index 50807b4..ec9328f 100644
--- a/CHANGELOG.md
+++ b/CHANGELOG.md
@@ -23,6 +23,13 @@ is not. This is now supported upstream by `flexilims`
 - `update_config` actually adds the new fields (i.e. fields that are in the default
 config but not the local config) to the config file
 
+## v0.3.11
+
+### Bugfixes
+
+- Fix bugs related to raw_data for projects not in main folder
+- Add mouse works with alive animals
+
 
 ## v0.3.10
 
@@ -75,6 +82,23 @@ config but not the local config) to the config file
 - `get_children` output is filtered to contain only relevant columns when   `children_datatype` is not None
 
 ### Bugfixes
+## v0.3.6
+
+### Main changes
+
+- New `SequencingData` class to handle sequencing data
+- Add a `conda_envs` field in the config file to use in conjuction with `znamutils`
+- `get_children` can work with name or id (instead of id only)
+
+### Minor
+- `add_mouse` uploads birth and death dates in a human readable format instead.
+- `get_entities` does not raise warnings anymore if `name` is specified and `datatype`
+is not. This is now supported upstream by `flexilims`
+
+### Bugfixes
+
+- `update_config` actually adds the new fields (i.e. fields that are in the default
+config but not the local config) to the config file
 
 ## v0.3.5
 

From d0972e67a61ae5d5f0996f150e00baec584d1c35 Mon Sep 17 00:00:00 2001
From: Antonin Blot <antonin.blot@crick.ac.uk>
Date: Thu, 30 May 2024 10:03:30 +0100
Subject: [PATCH 56/73] [minor] set enforce_dataset_types to False by default

---
 flexiznam/config/default_config.py | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

diff --git a/flexiznam/config/default_config.py b/flexiznam/config/default_config.py
index 5e0d4ae..55c7e2b 100644
--- a/flexiznam/config/default_config.py
+++ b/flexiznam/config/default_config.py
@@ -30,7 +30,7 @@
     # list of all datatypes
     datatypes=["mouse", "session", "recording", "dataset", "sample"],
     # should we limit the valid dataset types?
-    enforce_dataset_types=True,
+    enforce_dataset_types=False,
     # if we enforce, what is the list of valid dataset type
     dataset_types=[
         "scanimage",

From eeb1cc6d236a18e16d8f1d0a00e550f5749e3ccd Mon Sep 17 00:00:00 2001
From: Antonin Blot <antonin.blot@crick.ac.uk>
Date: Thu, 30 May 2024 10:10:17 +0100
Subject: [PATCH 57/73] [dev] add pre-commit config

---
 flexiznam/.pre-commit-config.yaml | 28 ++++++++++++++++++++++++++++
 1 file changed, 28 insertions(+)
 create mode 100644 flexiznam/.pre-commit-config.yaml

diff --git a/flexiznam/.pre-commit-config.yaml b/flexiznam/.pre-commit-config.yaml
new file mode 100644
index 0000000..b9d55f8
--- /dev/null
+++ b/flexiznam/.pre-commit-config.yaml
@@ -0,0 +1,28 @@
+exclude: 'conf.py'
+
+# Configuring https://pre-commit.ci/
+ci:
+    autoupdate_schedule: monthly
+
+repos:
+    - repo: https://github.com/pre-commit/pre-commit-hooks
+      rev: v4.4.0
+      hooks:
+          - id: check-docstring-first
+          - id: check-executables-have-shebangs
+          - id: check-merge-conflict
+          - id: check-toml
+          - id: end-of-file-fixer
+          - id: mixed-line-ending
+            args: [--fix=lf]
+          - id: requirements-txt-fixer
+          - id: trailing-whitespace
+    - repo: https://github.com/psf/black
+      rev: 23.3.0
+      hooks:
+          - id: black
+    - repo: https://github.com/kynan/nbstripout
+      rev: 0.6.1
+      hooks:
+          - id: nbstripout
+            args: [--extra-keys=metadata.language_info.version metadata.kernelspec.name metadata.kernelspec.display_name]

From 86b93a40a7c5b799a9a7f7183fd283ce71d38f05 Mon Sep 17 00:00:00 2001
From: Antonin Blot <antonin.blot@crick.ac.uk>
Date: Thu, 30 May 2024 10:12:47 +0100
Subject: [PATCH 58/73] [feature] extra_attributes in from_origin

To be able to return the dataset with the same arguments if it exists or create a new one otherwise (with `append`)
---
 CHANGELOG.md                 |  2 ++
 flexiznam/schema/datasets.py | 61 ++++++++++++++++++++++++++----------
 2 files changed, 46 insertions(+), 17 deletions(-)

diff --git a/CHANGELOG.md b/CHANGELOG.md
index ec9328f..ff73ed2 100644
--- a/CHANGELOG.md
+++ b/CHANGELOG.md
@@ -9,6 +9,8 @@
 - Add a `conda_envs` field in the config file to use in conjuction with `znamutils`
 - `get_children` can work with name or id (instead of id only)
 - `check_flexilims_issues` can now add missing paths
+- `Dataset.from_origin` has a new `extra_attributes` argument to match online datasets
+  with specific attributes only.
 
 ### Minor
 - `add_mouse` uploads birth and death dates in a human readable format instead.
diff --git a/flexiznam/schema/datasets.py b/flexiznam/schema/datasets.py
index 33eb04c..0f1c3fd 100644
--- a/flexiznam/schema/datasets.py
+++ b/flexiznam/schema/datasets.py
@@ -131,20 +131,22 @@ def from_origin(
         base_name=None,
         conflicts=None,
         flexilims_session=None,
+        extra_arguments=None,
     ):
         """Creates a dataset of a given type as a child of a parent entity
 
         Args:
             project (str): Name of the project or hexadecimal project_id
             origin_type (str): sample type of the origin
-            origin_id (str): hexadecimal ID of the origin. This or origin_name must be provided
+            origin_id (str): hexadecimal ID of the origin. This or origin_name must be
+                provided
             origin_name (str): name of the origin. This or origin_id must be provided
-            dataset_type (str): type of dataset to create. Must be defined in the config file
+            dataset_type (str): type of dataset to create. Must be defined in the config
+                file
             base_name (str): How is this dataset name? Use dataset_type if base_name is
                              None (default)
             conflicts (str): What to do if a dataset of this type already exists
-                as a child of the parent entity?
-
+                as a child of the parent entity? Behaviour modified by `extra_arguments`
                 `append`
                     Create a new dataset with a new name and path
                 `abort` or None
@@ -152,9 +154,13 @@ def from_origin(
                     exit
                 `skip` or `overwrite`
                     Return a Dataset corresponding to the existing entry if there
-                    is exactly one existing entry, otherwise through a
+                    is exactly one existing entry, otherwise throw a
                     :py:class:`flexiznam.errors.NameNotUniqueError`
-            flexilims_session (:py:class:`flexilims.Flexilims`): authentication session to connect to flexilims
+            flexilims_session (:py:class:`flexilims.Flexilims`): authentication session
+                to connect to flexilims
+            extra_arguments (dict): additional arguments. If provided, change the
+                `conflicts` behaviour to consider only datasets that have the exact
+                same extra_arguments.
 
         Returns:
             :py:class:`flexiznam.schema.datasets.Dataset`: a dataset object (WITHOUT updating flexilims)
@@ -185,8 +191,25 @@ def from_origin(
             processed = processed[
                 [g[-1].startswith(base_name) for g in processed.genealogy]
             ]
+
+        # If extra_arguments is provided, only consider datasets that have the exact
+        # same extra_arguments
+        if extra_arguments is not None:
+            valid_processed = []
+            for _, proc in processed.iterrows():
+                online = Dataset._format_series_to_kwargs(proc)["extra_attributes"]
+                differences = utils.compare_dictionaries_recursively(
+                    utils.clean_recursively(extra_arguments), online
+                )
+                if not differences:
+                    valid_processed.append(proc)
+        else:
+            valid_processed = processed
+
         already_processed = len(processed) > 0
-        if (not already_processed) or (conflicts == "append"):
+        if (not already_processed) or (
+            (not len(valid_processed)) and conflicts == "append"
+        ):
             dataset_root = "%s_%s" % (origin["name"], base_name)
             dataset_name = flz.generate_name(
                 "dataset",
@@ -208,21 +231,25 @@ def from_origin(
                 flexilims_session=flexilims_session,
             )
         else:
+            # There are some datasets of this type already online
             if (conflicts is None) or (conflicts == "abort"):
                 raise flz.errors.DatasetError(
                     f"Dataset(s) of type {dataset_type} already exist(s):"
                     + f" {processed.loc[:, 'name']}"
                 )
-            elif conflicts == "skip" or conflicts == "overwrite":
-                if len(processed) == 1:
-                    return Dataset.from_dataseries(dataseries=processed.iloc[0])
-                else:
-                    raise flz.errors.NameNotUniqueError(
-                        "{} {} datasets with name starting by {} exists for {}, "
-                        "which one to return?".format(
-                            len(processed), dataset_type, base_name, origin["name"]
-                        )
-                    )
+            elif conflicts == "skip" and len(valid_processed) == 1:
+                # If skip, ensure extra_arguments are the same
+                return Dataset.from_dataseries(dataseries=valid_processed[0])
+            elif conflicts == "overwrite" and len(processed) == 1:
+                # If overwrite, ensure there is only one dataset of this type as we
+                # won't be able to guess which one should be replaced
+                return Dataset.from_dataseries(dataseries=processed.iloc[0])
+            else:
+                txt = f"{len(processed)} {dataset_type} datasets with name starting by"
+                txt += f" {base_name} exists for {origin['name']}"
+                if extra_arguments:
+                    txt += f", {len(valid_processed)} matching extra_arguments"
+                raise flz.errors.NameNotUniqueError(txt)
 
     @staticmethod
     def _format_series_to_kwargs(flm_series):

From 2544d35322202db4289f0393e1ad003bd5d0587f Mon Sep 17 00:00:00 2001
From: Antonin Blot <antonin.blot@crick.ac.uk>
Date: Thu, 30 May 2024 10:15:22 +0100
Subject: [PATCH 59/73] [minor] remove temp file that should not have been
 committed

---
 temp.py | 14 --------------
 1 file changed, 14 deletions(-)
 delete mode 100644 temp.py

diff --git a/temp.py b/temp.py
deleted file mode 100644
index 872bbce..0000000
--- a/temp.py
+++ /dev/null
@@ -1,14 +0,0 @@
-from flexiznam.config import config_tools
-
-ymlfile = "/camp/home/blota/home/users/blota/temp/s20230605_valid.yml"
-from flexiznam.camp import sync_data as sd
-
-o = sd.parse_yaml(ymlfile)
-
-
-config_folder = None
-fname = config_tools._find_file("config.yml", config_folder=config_folder)
-prm = config_tools.load_param(param_folder=config_folder)
-config_tools.update_config(
-    param_file="config.yml", config_folder=config_folder, add_all_projects=True, **prm
-)

From c84c123e62dd3c0247ce8655b216264ed1e8829a Mon Sep 17 00:00:00 2001
From: Antonin Blot <antonin.blot@crick.ac.uk>
Date: Thu, 30 May 2024 11:05:43 +0100
Subject: [PATCH 60/73] [bugfix] correct behaviour of from_origin

With extra_attributes, we still need to be able to append if we want to
---
 flexiznam/schema/datasets.py | 111 +++++++++++++++++++++++++----------
 1 file changed, 79 insertions(+), 32 deletions(-)

diff --git a/flexiznam/schema/datasets.py b/flexiznam/schema/datasets.py
index 0f1c3fd..d677622 100644
--- a/flexiznam/schema/datasets.py
+++ b/flexiznam/schema/datasets.py
@@ -135,6 +135,18 @@ def from_origin(
     ):
         """Creates a dataset of a given type as a child of a parent entity
 
+        This function will create a dataset with a unique name based on the origin name
+        and the dataset type. If a dataset of this type already exists, the behaviour is
+        defined by the `conflicts` argument. If `extra_arguments` is provided, only
+        consider datasets that have the exact same extra_arguments when resolving
+        conflicts (see next paragraph for details).
+
+        Conflicts can be resolved in the following ways:
+        - `abort`: raise an error if a dataset of this type already exists
+
+
+
+
         Args:
             project (str): Name of the project or hexadecimal project_id
             origin_type (str): sample type of the origin
@@ -145,17 +157,8 @@ def from_origin(
                 file
             base_name (str): How is this dataset name? Use dataset_type if base_name is
                              None (default)
-            conflicts (str): What to do if a dataset of this type already exists
-                as a child of the parent entity? Behaviour modified by `extra_arguments`
-                `append`
-                    Create a new dataset with a new name and path
-                `abort` or None
-                    Through a :py:class:`flexiznam.errors.NameNotUniqueError` and
-                    exit
-                `skip` or `overwrite`
-                    Return a Dataset corresponding to the existing entry if there
-                    is exactly one existing entry, otherwise throw a
-                    :py:class:`flexiznam.errors.NameNotUniqueError`
+            conflicts (str): How to resolve conflicts? One of `abort`, `skip`, `append`,
+                `overwrite`. Default is `abort`
             flexilims_session (:py:class:`flexilims.Flexilims`): authentication session
                 to connect to flexilims
             extra_arguments (dict): additional arguments. If provided, change the
@@ -207,9 +210,16 @@ def from_origin(
             valid_processed = processed
 
         already_processed = len(processed) > 0
-        if (not already_processed) or (
-            (not len(valid_processed)) and conflicts == "append"
+
+        def _create_new_ds(
+            origin,
+            base_name,
+            project,
+            flexilims_session,
+            dataset_type,
+            extra_attributes,
         ):
+            """Inner function to create a new dataset object"""
             dataset_root = "%s_%s" % (origin["name"], base_name)
             dataset_name = flz.generate_name(
                 "dataset",
@@ -229,27 +239,64 @@ def from_origin(
                 project=project,
                 origin_id=origin["id"],
                 flexilims_session=flexilims_session,
+                extra_attributes=extra_attributes,
             )
-        else:
-            # There are some datasets of this type already online
-            if (conflicts is None) or (conflicts == "abort"):
-                raise flz.errors.DatasetError(
-                    f"Dataset(s) of type {dataset_type} already exist(s):"
-                    + f" {processed.loc[:, 'name']}"
-                )
-            elif conflicts == "skip" and len(valid_processed) == 1:
-                # If skip, ensure extra_arguments are the same
-                return Dataset.from_dataseries(dataseries=valid_processed[0])
-            elif conflicts == "overwrite" and len(processed) == 1:
-                # If overwrite, ensure there is only one dataset of this type as we
-                # won't be able to guess which one should be replaced
+
+        # CONFLICTS RESOLUTION
+        # There are no datasets, create one
+        if not already_processed:
+            return _create_new_ds(
+                origin,
+                base_name,
+                project,
+                flexilims_session,
+                dataset_type,
+                extra_arguments,
+            )
+        # There are some datasets of this type already online and we abort
+        if (conflicts is None) or (conflicts == "abort"):
+            raise flz.errors.DatasetError(
+                f"Dataset(s) of type {dataset_type} already exist(s):"
+                + f" {processed.loc[:, 'name']}"
+            )
+        # Three cases left: skip, append, overwrite
+        if conflicts == "overwrite":
+            # If overwrite, ensure there is only one dataset of this type as we
+            # won't be able to guess which one should be replaced
+            if len(processed) == 1:
                 return Dataset.from_dataseries(dataseries=processed.iloc[0])
-            else:
-                txt = f"{len(processed)} {dataset_type} datasets with name starting by"
-                txt += f" {base_name} exists for {origin['name']}"
-                if extra_arguments:
-                    txt += f", {len(valid_processed)} matching extra_arguments"
-                raise flz.errors.NameNotUniqueError(txt)
+            raise flz.errors.NameNotUniqueError(
+                f"Multiple datasets of type {dataset_type} already exist(s):"
+                + f" {processed.loc[:, 'name']}"
+            )
+        if conflicts == "skip":
+            # If skip and we have an exact match, return it
+            if len(valid_processed) == 1:
+                return Dataset.from_dataseries(dataseries=valid_processed[0])
+            # If there is no match, create a new dataset
+            if len(valid_processed) == 0:
+                return _create_new_ds(
+                    origin,
+                    base_name,
+                    project,
+                    flexilims_session,
+                    dataset_type,
+                    extra_arguments,
+                )
+            raise flz.errors.NameNotUniqueError(
+                f"Multiple datasets of type {dataset_type} already exist(s):"
+                + f" {processed.loc[:, 'name']}"
+            )
+        if conflicts == "append":
+            # Create a new dataset
+            return _create_new_ds(
+                origin,
+                base_name,
+                project,
+                flexilims_session,
+                dataset_type,
+                extra_arguments,
+            )
 
     @staticmethod
     def _format_series_to_kwargs(flm_series):

From 4a2fe9e33959e48a383f8699d4cc8f34b609357e Mon Sep 17 00:00:00 2001
From: Antonin Blot <antonin.blot@crick.ac.uk>
Date: Thu, 30 May 2024 18:07:46 +0100
Subject: [PATCH 61/73] [bugfix] correct extra_attribute in from_origin

- rename to match other places
- correct behaviour with append, always append
---
 flexiznam/schema/datasets.py | 42 +++++++++++++++++++-----------------
 1 file changed, 22 insertions(+), 20 deletions(-)

diff --git a/flexiznam/schema/datasets.py b/flexiznam/schema/datasets.py
index d677622..1b0e93b 100644
--- a/flexiznam/schema/datasets.py
+++ b/flexiznam/schema/datasets.py
@@ -131,20 +131,16 @@ def from_origin(
         base_name=None,
         conflicts=None,
         flexilims_session=None,
-        extra_arguments=None,
+        extra_attributes=None,
+        ignore_attributes=(),
     ):
         """Creates a dataset of a given type as a child of a parent entity
 
         This function will create a dataset with a unique name based on the origin name
         and the dataset type. If a dataset of this type already exists, the behaviour is
-        defined by the `conflicts` argument. If `extra_arguments` is provided, only
-        consider datasets that have the exact same extra_arguments when resolving
-        conflicts (see next paragraph for details).
-
-        Conflicts can be resolved in the following ways:
-        - `abort`: raise an error if a dataset of this type already exists
-
-
+        defined by the `conflicts` argument. If `extra_attributes` is provided, only
+        consider datasets that have the exact same extra_attributes when resolving
+        conflicts.
 
 
         Args:
@@ -161,9 +157,11 @@ def from_origin(
                 `overwrite`. Default is `abort`
             flexilims_session (:py:class:`flexilims.Flexilims`): authentication session
                 to connect to flexilims
-            extra_arguments (dict): additional arguments. If provided, change the
+            extra_attributes (dict): additional arguments. If provided, change the
                 `conflicts` behaviour to consider only datasets that have the exact
-                same extra_arguments.
+                same extra_attributes.
+            ignore_attributes (list): list of arguments to ignore when comparing datasets
+                for conflicts resolution. Used only if `extra_attributes` is provided.
 
         Returns:
             :py:class:`flexiznam.schema.datasets.Dataset`: a dataset object (WITHOUT updating flexilims)
@@ -195,15 +193,17 @@ def from_origin(
                 [g[-1].startswith(base_name) for g in processed.genealogy]
             ]
 
-        # If extra_arguments is provided, only consider datasets that have the exact
-        # same extra_arguments
-        if extra_arguments is not None:
+        # If extra_attributes is provided, only consider datasets that have the exact
+        # same extra_attributes
+        if extra_attributes is not None:
             valid_processed = []
+            to_compare = utils.clean_recursively(
+                extra_attributes.copy(), keys=ignore_attributes
+            )
             for _, proc in processed.iterrows():
                 online = Dataset._format_series_to_kwargs(proc)["extra_attributes"]
-                differences = utils.compare_dictionaries_recursively(
-                    utils.clean_recursively(extra_arguments), online
-                )
+                online = utils.clean_recursively(online, keys=ignore_attributes)
+                differences = utils.compare_dictionaries_recursively(to_compare, online)
                 if not differences:
                     valid_processed.append(proc)
         else:
@@ -251,7 +251,7 @@ def _create_new_ds(
                 project,
                 flexilims_session,
                 dataset_type,
-                extra_arguments,
+                extra_attributes,
             )
         # There are some datasets of this type already online and we abort
         if (conflicts is None) or (conflicts == "abort"):
@@ -263,6 +263,8 @@ def _create_new_ds(
         if conflicts == "overwrite":
             # If overwrite, ensure there is only one dataset of this type as we
             # won't be able to guess which one should be replaced
+            if len(valid_processed) == 1:
+                return Dataset.from_dataseries(dataseries=valid_processed[0])
             if len(processed) == 1:
                 return Dataset.from_dataseries(dataseries=processed.iloc[0])
             raise flz.errors.NameNotUniqueError(
@@ -281,7 +283,7 @@ def _create_new_ds(
                     project,
                     flexilims_session,
                     dataset_type,
-                    extra_arguments,
+                    extra_attributes,
                 )
             raise flz.errors.NameNotUniqueError(
                 f"Multiple datasets of type {dataset_type} already exist(s):"
@@ -295,7 +297,7 @@ def _create_new_ds(
                 project,
                 flexilims_session,
                 dataset_type,
-                extra_arguments,
+                extra_attributes,
             )
 
     @staticmethod

From 6fcf2f284014372f413bdf441e3528328559be58 Mon Sep 17 00:00:00 2001
From: Antonin Blot <antonin.blot@crick.ac.uk>
Date: Mon, 17 Jun 2024 11:09:53 +0100
Subject: [PATCH 62/73] [feature] add delete recursively

---
 flexiznam/main.py | 30 ++++++++++++++++++++++++++++++
 1 file changed, 30 insertions(+)

diff --git a/flexiznam/main.py b/flexiznam/main.py
index da6576f..7c3f43f 100755
--- a/flexiznam/main.py
+++ b/flexiznam/main.py
@@ -1317,3 +1317,33 @@ def format_results(results, return_list=False):
     if return_list:
         return results
     return pd.DataFrame(results)
+
+
+def delete_recursively(source_id, flexilims_session, do_it=False):
+    """Delete an entity and all its children recursively
+
+    Args:
+        source_id (str): hexadecimal ID of the entity to delete
+        flexilims_session (:py:class:`flexilims.Flexilims`): Flexilims session object
+        do_it (bool): if True, will actually delete the entities
+
+    Returns:
+        list: hexadecimal IDs of the entities to delete
+
+    """
+    to_delete = []
+
+    def _get_children(parent_id):
+        children = get_children(
+            parent_id=parent_id, flexilims_session=flexilims_session
+        )
+        for _, child in children.iterrows():
+            to_delete.append(child["id"])
+            if child["type"] != "dataset":
+                _get_children(child["id"])
+
+    _get_children(source_id)
+    if do_it:
+        for child_id in to_delete:
+            flexilims_session.delete(child_id)
+    return to_delete

From 291087dd81be1c38a317a65ad873f621d783a4b4 Mon Sep 17 00:00:00 2001
From: Antonin Blot <antonin.blot@crick.ac.uk>
Date: Mon, 17 Jun 2024 11:10:45 +0100
Subject: [PATCH 63/73] [doc] edit changelog

---
 CHANGELOG.md | 1 +
 1 file changed, 1 insertion(+)

diff --git a/CHANGELOG.md b/CHANGELOG.md
index ff73ed2..b5439f6 100644
--- a/CHANGELOG.md
+++ b/CHANGELOG.md
@@ -11,6 +11,7 @@
 - `check_flexilims_issues` can now add missing paths
 - `Dataset.from_origin` has a new `extra_attributes` argument to match online datasets
   with specific attributes only.
+- `delete_recursively` can delete all children of an entity
 
 ### Minor
 - `add_mouse` uploads birth and death dates in a human readable format instead.

From 3dbbde6c290de6d645093ab64e1ba7ac40888afc Mon Sep 17 00:00:00 2001
From: Antonin Blot <antonin.blot@crick.ac.uk>
Date: Tue, 18 Jun 2024 16:26:48 +0100
Subject: [PATCH 64/73] [bugfix] from_origin respects base_name

It was looking for anything starting with base_name, improve by ensuring that base_name is followed by '_'

There is still a risk of confusion between `poorly_chosen` and
`poorly_chosen_name`, for instance.
---
 flexiznam/schema/datasets.py | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

diff --git a/flexiznam/schema/datasets.py b/flexiznam/schema/datasets.py
index 1b0e93b..7148370 100644
--- a/flexiznam/schema/datasets.py
+++ b/flexiznam/schema/datasets.py
@@ -190,7 +190,7 @@ def from_origin(
         )
         if len(processed):
             processed = processed[
-                [g[-1].startswith(base_name) for g in processed.genealogy]
+                [g[-1].startswith(base_name + "_") for g in processed.genealogy]
             ]
 
         # If extra_attributes is provided, only consider datasets that have the exact

From f23f47d398788ebc04dd66775b3da348ae2369bb Mon Sep 17 00:00:00 2001
From: Antonin Blot <antonin.blot@crick.ac.uk>
Date: Tue, 18 Jun 2024 16:27:25 +0100
Subject: [PATCH 65/73] [minor] verbose option for from_origin

To display if a match was found or a new dataset created
---
 flexiznam/schema/datasets.py | 14 ++++++++++++++
 1 file changed, 14 insertions(+)

diff --git a/flexiznam/schema/datasets.py b/flexiznam/schema/datasets.py
index 7148370..a636e2c 100644
--- a/flexiznam/schema/datasets.py
+++ b/flexiznam/schema/datasets.py
@@ -133,6 +133,7 @@ def from_origin(
         flexilims_session=None,
         extra_attributes=None,
         ignore_attributes=(),
+        verbose=False,
     ):
         """Creates a dataset of a given type as a child of a parent entity
 
@@ -162,6 +163,7 @@ def from_origin(
                 same extra_attributes.
             ignore_attributes (list): list of arguments to ignore when comparing datasets
                 for conflicts resolution. Used only if `extra_attributes` is provided.
+            verbose (bool): print debug information
 
         Returns:
             :py:class:`flexiznam.schema.datasets.Dataset`: a dataset object (WITHOUT updating flexilims)
@@ -245,6 +247,8 @@ def _create_new_ds(
         # CONFLICTS RESOLUTION
         # There are no datasets, create one
         if not already_processed:
+            if verbose:
+                print("No datasets of type %s found. Creating new" % dataset_type)
             return _create_new_ds(
                 origin,
                 base_name,
@@ -264,8 +268,12 @@ def _create_new_ds(
             # If overwrite, ensure there is only one dataset of this type as we
             # won't be able to guess which one should be replaced
             if len(valid_processed) == 1:
+                if verbose:
+                    print("Overwriting dataset %s" % valid_processed.iloc[0].name)
                 return Dataset.from_dataseries(dataseries=valid_processed[0])
             if len(processed) == 1:
+                if verbose:
+                    print("Overwriting dataset %s" % processed.iloc[0].name)
                 return Dataset.from_dataseries(dataseries=processed.iloc[0])
             raise flz.errors.NameNotUniqueError(
                 f"Multiple datasets of type {dataset_type} already exist(s):"
@@ -274,9 +282,13 @@ def _create_new_ds(
         if conflicts == "skip":
             # If skip and we have an exact match, return it
             if len(valid_processed) == 1:
+                if verbose:
+                    print("Skip. Returning dataset %s" % valid_processed.iloc[0].name)
                 return Dataset.from_dataseries(dataseries=valid_processed[0])
             # If there is no match, create a new dataset
             if len(valid_processed) == 0:
+                if verbose:
+                    print("No matching dataset found. Creating new dataset")
                 return _create_new_ds(
                     origin,
                     base_name,
@@ -291,6 +303,8 @@ def _create_new_ds(
             )
         if conflicts == "append":
             # Create a new dataset
+            if verbose:
+                print("Appending dataset")
             return _create_new_ds(
                 origin,
                 base_name,

From 11cad5aafeabfaf7bd42421918adeeb3b34300cc Mon Sep 17 00:00:00 2001
From: Antonin Blot <antonin.blot@crick.ac.uk>
Date: Thu, 20 Jun 2024 15:49:51 +0100
Subject: [PATCH 66/73] [bugfix] verbose print had syntax error

---
 flexiznam/schema/datasets.py | 4 ++--
 1 file changed, 2 insertions(+), 2 deletions(-)

diff --git a/flexiznam/schema/datasets.py b/flexiznam/schema/datasets.py
index a636e2c..08026e9 100644
--- a/flexiznam/schema/datasets.py
+++ b/flexiznam/schema/datasets.py
@@ -269,7 +269,7 @@ def _create_new_ds(
             # won't be able to guess which one should be replaced
             if len(valid_processed) == 1:
                 if verbose:
-                    print("Overwriting dataset %s" % valid_processed.iloc[0].name)
+                    print("Overwriting dataset %s" % valid_processed[0].name)
                 return Dataset.from_dataseries(dataseries=valid_processed[0])
             if len(processed) == 1:
                 if verbose:
@@ -283,7 +283,7 @@ def _create_new_ds(
             # If skip and we have an exact match, return it
             if len(valid_processed) == 1:
                 if verbose:
-                    print("Skip. Returning dataset %s" % valid_processed.iloc[0].name)
+                    print("Skip. Returning dataset %s" % valid_processed[0].name)
                 return Dataset.from_dataseries(dataseries=valid_processed[0])
             # If there is no match, create a new dataset
             if len(valid_processed) == 0:

From 5d5a303ec87428853ecf989981b16a49e9f65362 Mon Sep 17 00:00:00 2001
From: Antonin Blot <antonin.blot@crick.ac.uk>
Date: Wed, 24 Jul 2024 16:59:59 +0100
Subject: [PATCH 67/73] [bugfix] from_origin overwrite extra_attributes

it was not overwriting
---
 flexiznam/schema/datasets.py | 8 ++++++--
 1 file changed, 6 insertions(+), 2 deletions(-)

diff --git a/flexiznam/schema/datasets.py b/flexiznam/schema/datasets.py
index 08026e9..51f343b 100644
--- a/flexiznam/schema/datasets.py
+++ b/flexiznam/schema/datasets.py
@@ -270,11 +270,15 @@ def _create_new_ds(
             if len(valid_processed) == 1:
                 if verbose:
                     print("Overwriting dataset %s" % valid_processed[0].name)
-                return Dataset.from_dataseries(dataseries=valid_processed[0])
+                dataset = Dataset.from_dataseries(dataseries=valid_processed[0])
+                dataset.extra_attributes = extra_attributes
+                return dataset
             if len(processed) == 1:
                 if verbose:
                     print("Overwriting dataset %s" % processed.iloc[0].name)
-                return Dataset.from_dataseries(dataseries=processed.iloc[0])
+                dataset = Dataset.from_dataseries(dataseries=processed.iloc[0])
+                dataset.extra_attributes = extra_attributes
+                return dataset
             raise flz.errors.NameNotUniqueError(
                 f"Multiple datasets of type {dataset_type} already exist(s):"
                 + f" {processed.loc[:, 'name']}"

From 897fee8ae655b329b5c3bc56b02b5a845214209f Mon Sep 17 00:00:00 2001
From: Antonin Blot <antonin.blot@crick.ac.uk>
Date: Fri, 26 Jul 2024 11:03:58 +0100
Subject: [PATCH 68/73] [bugfix] delete_recursively deletes source

---
 flexiznam/main.py | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

diff --git a/flexiznam/main.py b/flexiznam/main.py
index 7c3f43f..97f4f95 100755
--- a/flexiznam/main.py
+++ b/flexiznam/main.py
@@ -1331,7 +1331,7 @@ def delete_recursively(source_id, flexilims_session, do_it=False):
         list: hexadecimal IDs of the entities to delete
 
     """
-    to_delete = []
+    to_delete = [source_id]
 
     def _get_children(parent_id):
         children = get_children(

From 3eaca8aa15ceef8ceb59c3351bbccbcc7b7d15c8 Mon Sep 17 00:00:00 2001
From: Antonin Blot <antonin.blot@crick.ac.uk>
Date: Mon, 5 Aug 2024 18:44:47 +0100
Subject: [PATCH 69/73] [bugfix] in from_origin with skip

---
 flexiznam/schema/datasets.py | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

diff --git a/flexiznam/schema/datasets.py b/flexiznam/schema/datasets.py
index 51f343b..eca983e 100644
--- a/flexiznam/schema/datasets.py
+++ b/flexiznam/schema/datasets.py
@@ -209,7 +209,7 @@ def from_origin(
                 if not differences:
                     valid_processed.append(proc)
         else:
-            valid_processed = processed
+            valid_processed = [ser for _, ser in processed.iterrows()]
 
         already_processed = len(processed) > 0
 

From e0d1f6067d047da291ab162643df9beba4672e6d Mon Sep 17 00:00:00 2001
From: Antonin Blot <antonin.blot@crick.ac.uk>
Date: Thu, 19 Sep 2024 09:17:09 +0100
Subject: [PATCH 70/73] [feature] option to get session in offline mode

---
 flexiznam/main.py | 14 +++++++++++++-
 1 file changed, 13 insertions(+), 1 deletion(-)

diff --git a/flexiznam/main.py b/flexiznam/main.py
index 97f4f95..920b8fc 100755
--- a/flexiznam/main.py
+++ b/flexiznam/main.py
@@ -10,7 +10,7 @@
 import yaml
 from flexiznam import mcms
 from flexiznam.config import PARAMETERS, get_password
-from flexiznam.errors import NameNotUniqueError, FlexilimsError
+from flexiznam.errors import NameNotUniqueError, FlexilimsError, ConfigurationError
 
 
 warnings.simplefilter("always", DeprecationWarning)
@@ -93,6 +93,18 @@ def get_flexilims_session(
     Returns:
         :py:class:`flexilims.Flexilims`: Flexilims session object.
     """
+    offline_mode = PARAMETERS.get("offline_mode", False)
+    if offline_mode:
+        yaml_file = PARAMETERS.get("offline_yaml", None)
+        if yaml_file is None:
+            raise ConfigurationError("offline_mode is set but offline_yaml is not")
+        yaml_file = Path(yaml_file)
+        if not yaml_file.exists():
+            yaml_file = get_data_root("processed") / yaml_file
+        if not yaml_file.exists():
+            raise ConfigurationError(f"offline_yaml file {yaml_file} not found")
+        flexilims_session = flm.OfflineFlexilims(yaml_file)
+        return flexilims_session
 
     if project_id is not None:
         project_id = _format_project(project_id, PARAMETERS)

From 4e5590c696008f3e184c711d1225f83f75d8b8ba Mon Sep 17 00:00:00 2001
From: Antonin Blot <antonin.blot@crick.ac.uk>
Date: Fri, 20 Sep 2024 18:05:30 +0100
Subject: [PATCH 71/73] [feature] option to set offline mode outside of ops

It is sometimes convenient to have both online and offline sessions at the same time.
Do it by specifying arg in get_session
---
 flexiznam/main.py | 12 ++++++++++--
 1 file changed, 10 insertions(+), 2 deletions(-)

diff --git a/flexiznam/main.py b/flexiznam/main.py
index 920b8fc..4e0ff94 100755
--- a/flexiznam/main.py
+++ b/flexiznam/main.py
@@ -76,6 +76,7 @@ def get_flexilims_session(
     password=None,
     reuse_token=True,
     timeout=10,
+    offline_mode=None,
 ):
     """Open a new flexilims session by creating a new authentication token.
 
@@ -88,12 +89,19 @@ def get_flexilims_session(
             read from the secrets file, or failing that triggers an input prompt.
         reuse_token (bool): (optional) if True, try to reuse an existing token
         timeout (int): (optional) timeout in seconds for the portalocker lock. Default
-            to 10.
+                to 10.
+        offline_mode (bool): (optional) if True, will use an offline session. In this
+            case, the `offline_yaml` parameter must be set in the config file. If
+            not provided, will look for the `offline_mode` parameter in the config
+            file. Default to None.
+        
 
     Returns:
         :py:class:`flexilims.Flexilims`: Flexilims session object.
     """
-    offline_mode = PARAMETERS.get("offline_mode", False)
+    if offline_mode is None:
+        offline_mode = PARAMETERS.get("offline_mode", False)
+        
     if offline_mode:
         yaml_file = PARAMETERS.get("offline_yaml", None)
         if yaml_file is None:

From 046c2ccb8be6cdfa30ccdf0f573b167ebb2f383a Mon Sep 17 00:00:00 2001
From: Antonin Blot <antonin.blot@crick.ac.uk>
Date: Mon, 23 Sep 2024 16:18:44 +0100
Subject: [PATCH 72/73] [minor] adapt to offline flexilims

---
 flexiznam/main.py            | 18 ++++++++++--------
 flexiznam/schema/datasets.py |  2 +-
 2 files changed, 11 insertions(+), 9 deletions(-)

diff --git a/flexiznam/main.py b/flexiznam/main.py
index 4e0ff94..6347b4e 100755
--- a/flexiznam/main.py
+++ b/flexiznam/main.py
@@ -94,30 +94,32 @@ def get_flexilims_session(
             case, the `offline_yaml` parameter must be set in the config file. If
             not provided, will look for the `offline_mode` parameter in the config
             file. Default to None.
-        
+
 
     Returns:
         :py:class:`flexilims.Flexilims`: Flexilims session object.
     """
+
+    if project_id is not None:
+        project_id = _format_project(project_id, PARAMETERS)
+    else:
+        warnings.warn("Starting flexilims session without setting project_id.")
+
     if offline_mode is None:
         offline_mode = PARAMETERS.get("offline_mode", False)
-        
+
     if offline_mode:
         yaml_file = PARAMETERS.get("offline_yaml", None)
         if yaml_file is None:
             raise ConfigurationError("offline_mode is set but offline_yaml is not")
         yaml_file = Path(yaml_file)
         if not yaml_file.exists():
-            yaml_file = get_data_root("processed") / yaml_file
+            yaml_file = get_data_root("processed", project=project_id) / yaml_file
         if not yaml_file.exists():
             raise ConfigurationError(f"offline_yaml file {yaml_file} not found")
-        flexilims_session = flm.OfflineFlexilims(yaml_file)
+        flexilims_session = flm.OfflineFlexilims(yaml_file, project_id=project_id)
         return flexilims_session
 
-    if project_id is not None:
-        project_id = _format_project(project_id, PARAMETERS)
-    else:
-        warnings.warn("Starting flexilims session without setting project_id.")
     if username is None:
         username = PARAMETERS["flexilims_username"]
     if password is None:
diff --git a/flexiznam/schema/datasets.py b/flexiznam/schema/datasets.py
index eca983e..2da759e 100644
--- a/flexiznam/schema/datasets.py
+++ b/flexiznam/schema/datasets.py
@@ -690,7 +690,7 @@ def flexilims_session(self, value):
         self._flexilims_session = value
         if value is None:
             return
-        if hasattr(value, "project_id"):
+        if hasattr(value, "project_id") and (value.project_id is not None):
             if self.project_id is None:
                 self.project_id = value.project_id
             elif self.project_id != value.project_id:

From 465730c13a8ae2c2c233c72020520e8324a5dc31 Mon Sep 17 00:00:00 2001
From: Petr Znamenskiy <petr.znamenskiy@crick.ac.uk>
Date: Thu, 26 Sep 2024 14:06:43 +0100
Subject: [PATCH 73/73] Update CHANGELOG.md

---
 CHANGELOG.md | 1 +
 1 file changed, 1 insertion(+)

diff --git a/CHANGELOG.md b/CHANGELOG.md
index b5439f6..b28132d 100644
--- a/CHANGELOG.md
+++ b/CHANGELOG.md
@@ -12,6 +12,7 @@
 - `Dataset.from_origin` has a new `extra_attributes` argument to match online datasets
   with specific attributes only.
 - `delete_recursively` can delete all children of an entity
+- Offline mode using downloaded copy of the database
 
 ### Minor
 - `add_mouse` uploads birth and death dates in a human readable format instead.