From 1f4a2b9813b0d6f8ccc2765d4e4b567c8b718a15 Mon Sep 17 00:00:00 2001 From: Altana Namsaraeva <99650244+namsaraeva@users.noreply.github.com> Date: Wed, 11 Sep 2024 11:16:12 +0200 Subject: [PATCH 01/56] Update requirements_dev.txt --- requirements_dev.txt | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/requirements_dev.txt b/requirements_dev.txt index bb4f4adc..9057b769 100644 --- a/requirements_dev.txt +++ b/requirements_dev.txt @@ -36,7 +36,7 @@ pyqt5 lxml_html_clean ashlar @ git+https://github.com/labsyspharm/ashlar.git@master networkx -py-lmd @ git+https://github.com/MannLabs/py-lmd.git@refs/pull/11/head#egg=py-lmd +py-lmd #packages for building the documentation sphinx From edc1274e9acb1fd547460ee4ab38581f969493f6 Mon Sep 17 00:00:00 2001 From: Altana Namsaraeva <99650244+namsaraeva@users.noreply.github.com> Date: Wed, 11 Sep 2024 11:16:48 +0200 Subject: [PATCH 02/56] Update requirements.txt --- requirements.txt | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/requirements.txt b/requirements.txt index 1fe95023..7a2f2d12 100755 --- a/requirements.txt +++ b/requirements.txt @@ -36,4 +36,4 @@ pyqt5 lxml_html_clean ashlar @ git+https://github.com/labsyspharm/ashlar.git@master networkx -py-lmd @ git+https://github.com/MannLabs/py-lmd.git@refs/pull/11/head#egg=py-lmd +py-lmd From 42f94f7cd2a1ac9fc20549e2f80a101f278a936f Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Sophia=20M=C3=A4dler?= <15019107+sophiamaedler@users.noreply.github.com> Date: Thu, 19 Dec 2024 16:04:02 +0100 Subject: [PATCH 03/56] cleanup not required variables --- examples/notebooks | 2 +- src/scportrait/pipeline/_base.py | 21 +++++++++++++++++++++ src/scportrait/pipeline/extraction.py | 10 ++++++---- 3 files changed, 28 insertions(+), 5 deletions(-) diff --git a/examples/notebooks b/examples/notebooks index 17b3196e..bd8f8f04 160000 --- a/examples/notebooks +++ b/examples/notebooks @@ -1 +1 @@ -Subproject commit 17b3196ec1459f7cbc3a155e6e3285ec64b25db9 +Subproject commit bd8f8f041aa02bd4d18f1be4655f9e7cc4dfa307 diff --git a/src/scportrait/pipeline/_base.py b/src/scportrait/pipeline/_base.py index ab35274c..fcc50d97 100644 --- a/src/scportrait/pipeline/_base.py +++ b/src/scportrait/pipeline/_base.py @@ -91,6 +91,27 @@ def _clean_log_file(self): if os.path.exists(log_file_path): os.remove(log_file_path) + + # def _clear_cache(self, vars_to_delete=None): + # """Helper function to help clear memory usage. Mainly relevant for GPU based segmentations. + + # Args: + # vars_to_delete (list): List of variable names (as strings) to delete. + # """ + + # # delete all specified variables + # if vars_to_delete is not None: + # for var_name in vars_to_delete: + # if var_name in globals(): + # del globals()[var_name] + + # if torch.cuda.is_available(): + # torch.cuda.empty_cache() + + # if torch.backends.mps.is_available(): + # torch.mps.empty_cache() + + # gc.collect() def _clear_cache(self, vars_to_delete=None): """Helper function to help clear memory usage. 
Mainly relevant for GPU based segmentations.""" diff --git a/src/scportrait/pipeline/extraction.py b/src/scportrait/pipeline/extraction.py index ff51dfdf..9eb46647 100644 --- a/src/scportrait/pipeline/extraction.py +++ b/src/scportrait/pipeline/extraction.py @@ -877,7 +877,7 @@ def process(self, partial=False, n_cells=None, seed=42): else: # set up function for multi-threaded processing f = func_partial(self._extract_classes_multi, self.px_centers) - batched_args = self._generate_batched_args(args) + args = self._generate_batched_args(args) self.log(f"Running in multiprocessing mode with {self.threads} threads.") with mp.get_context("fork").Pool( @@ -885,17 +885,19 @@ def process(self, partial=False, n_cells=None, seed=42): ) as pool: # both spawn and fork work but fork is faster so forcing fork here results = list( tqdm( - pool.imap(f, batched_args), - total=len(batched_args), + pool.imap(f, args), + total=len(args), desc="Processing cell batches", ) ) pool.close() pool.join() - print("multiprocessing done.") self.save_index_to_remove = flatten(results) + #cleanup memory and remove any no longer required variables + del results, args + #self._clear_cache(vars_to_delete=["results", "args"]) # this is not working as expected at the moment so need to manually delete the variables stop_extraction = timeit.default_timer() # calculate duration From 2c1ea3cdff311c454b9a219441b3dd3b6b4797f8 Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Sophia=20M=C3=A4dler?= <15019107+sophiamaedler@users.noreply.github.com> Date: Thu, 19 Dec 2024 16:04:16 +0100 Subject: [PATCH 04/56] manually delete variables --- src/scportrait/pipeline/extraction.py | 13 +++++++++---- 1 file changed, 9 insertions(+), 4 deletions(-) diff --git a/src/scportrait/pipeline/extraction.py b/src/scportrait/pipeline/extraction.py index 9eb46647..592723a0 100644 --- a/src/scportrait/pipeline/extraction.py +++ b/src/scportrait/pipeline/extraction.py @@ -651,7 +651,8 @@ def _transfer_tempmmap_to_hdf5(self): ) # increase to 64 bit otherwise information may become truncated self.log("single-cell index created.") - self._clear_cache(vars_to_delete=[cell_ids]) + del cell_ids + #self._clear_cache(vars_to_delete=[cell_ids]) # this is not working as expected so we will just delete the variable directly _, c, x, y = _tmp_single_cell_data.shape single_cell_data = hf.create_dataset( @@ -668,7 +669,8 @@ def _transfer_tempmmap_to_hdf5(self): single_cell_data[ix] = _tmp_single_cell_data[i] self.log("single-cell data created") - self._clear_cache(vars_to_delete=[single_cell_data]) + del single_cell_data + #self._clear_cache(vars_to_delete=[single_cell_data]) # this is not working as expected so we will just delete the variable directly # also transfer labelled index to HDF5 index_labelled = _tmp_single_cell_index[keep_index] @@ -684,7 +686,8 @@ def _transfer_tempmmap_to_hdf5(self): hf.create_dataset("single_cell_index_labelled", data=index_labelled, chunks=None, dtype=dt) self.log("single-cell index labelled created.") - self._clear_cache(vars_to_delete=[index_labelled]) + del index_labelled + #self._clear_cache(vars_to_delete=[index_labelled]) # this is not working as expected so we will just delete the variable directly hf.create_dataset( "channel_information", @@ -695,7 +698,9 @@ def _transfer_tempmmap_to_hdf5(self): self.log("channel information created.") # cleanup memory - self._clear_cache(vars_to_delete=[_tmp_single_cell_index, index_labelled]) + del _tmp_single_cell_index + #self._clear_cache(vars_to_delete=[_tmp_single_cell_index]) # this is not 
working as expected so we will just delete the variable directly + os.remove(self._tmp_single_cell_data_path) os.remove(self._tmp_single_cell_index_path) From 80410363bb8f439e209b9d7d159252a064ba414e Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Sophia=20M=C3=A4dler?= <15019107+sophiamaedler@users.noreply.github.com> Date: Thu, 19 Dec 2024 16:04:34 +0100 Subject: [PATCH 05/56] properly display figure --- src/scportrait/pipeline/extraction.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/src/scportrait/pipeline/extraction.py b/src/scportrait/pipeline/extraction.py index 592723a0..662b96f5 100644 --- a/src/scportrait/pipeline/extraction.py +++ b/src/scportrait/pipeline/extraction.py @@ -636,7 +636,7 @@ def _transfer_tempmmap_to_hdf5(self): axs[i].imshow(img, vmin=0, vmax=1) axs[i].axis("off") fig.tight_layout() - fig.show() + plt.show(fig) self.log("Transferring extracted single cells to .hdf5") From 9542b24be82b033b749c688170ad7e02799f4979 Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Sophia=20M=C3=A4dler?= <15019107+sophiamaedler@users.noreply.github.com> Date: Fri, 20 Dec 2024 17:22:49 +0100 Subject: [PATCH 06/56] standardize log output --- src/scportrait/pipeline/extraction.py | 4 +--- 1 file changed, 1 insertion(+), 3 deletions(-) diff --git a/src/scportrait/pipeline/extraction.py b/src/scportrait/pipeline/extraction.py index 662b96f5..afec8da8 100644 --- a/src/scportrait/pipeline/extraction.py +++ b/src/scportrait/pipeline/extraction.py @@ -813,7 +813,6 @@ def process(self, partial=False, n_cells=None, seed=42): # directory where intermediate results should be saved cache: "/mnt/temp/cache" """ - total_time_start = timeit.default_timer() start_setup = timeit.default_timer() @@ -876,7 +875,7 @@ def process(self, partial=False, n_cells=None, seed=42): self.log("Running in single threaded mode.") results = [] - for arg in tqdm(args): + for arg in tqdm(args, total = len(args), desc = "Processing cell batches"): x = f(arg) results.append(x) else: @@ -919,7 +918,6 @@ def process(self, partial=False, n_cells=None, seed=42): self.DEFAULT_LOG_NAME = "processing.log" # change log name back to default self._post_extraction_cleanup() - total_time_stop = timeit.default_timer() total_time = total_time_stop - total_time_start From bf0d2cdb20cffd094d2923faff269e41b0708e8b Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Sophia=20M=C3=A4dler?= <15019107+sophiamaedler@users.noreply.github.com> Date: Wed, 8 Jan 2025 16:20:57 +0100 Subject: [PATCH 07/56] update git submodule version --- examples/notebooks | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/examples/notebooks b/examples/notebooks index bd8f8f04..5a9b127f 160000 --- a/examples/notebooks +++ b/examples/notebooks @@ -1 +1 @@ -Subproject commit bd8f8f041aa02bd4d18f1be4655f9e7cc4dfa307 +Subproject commit 5a9b127f06a39d326931728a0cf9850848fca205 From 6a590919830db433b093134036c574bad80e1f63 Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Sophia=20M=C3=A4dler?= <15019107+sophiamaedler@users.noreply.github.com> Date: Wed, 8 Jan 2025 16:21:49 +0100 Subject: [PATCH 08/56] fix pre-commit issues --- src/scportrait/pipeline/_base.py | 2 +- src/scportrait/pipeline/extraction.py | 14 +++++++------- 2 files changed, 8 insertions(+), 8 deletions(-) diff --git a/src/scportrait/pipeline/_base.py b/src/scportrait/pipeline/_base.py index fcc50d97..4bc63d87 100644 --- a/src/scportrait/pipeline/_base.py +++ b/src/scportrait/pipeline/_base.py @@ -91,7 +91,7 @@ def _clean_log_file(self): if os.path.exists(log_file_path): os.remove(log_file_path) - + # def 
_clear_cache(self, vars_to_delete=None): # """Helper function to help clear memory usage. Mainly relevant for GPU based segmentations. diff --git a/src/scportrait/pipeline/extraction.py b/src/scportrait/pipeline/extraction.py index afec8da8..ad672ff3 100644 --- a/src/scportrait/pipeline/extraction.py +++ b/src/scportrait/pipeline/extraction.py @@ -652,7 +652,7 @@ def _transfer_tempmmap_to_hdf5(self): self.log("single-cell index created.") del cell_ids - #self._clear_cache(vars_to_delete=[cell_ids]) # this is not working as expected so we will just delete the variable directly + # self._clear_cache(vars_to_delete=[cell_ids]) # this is not working as expected so we will just delete the variable directly _, c, x, y = _tmp_single_cell_data.shape single_cell_data = hf.create_dataset( @@ -670,7 +670,7 @@ def _transfer_tempmmap_to_hdf5(self): self.log("single-cell data created") del single_cell_data - #self._clear_cache(vars_to_delete=[single_cell_data]) # this is not working as expected so we will just delete the variable directly + # self._clear_cache(vars_to_delete=[single_cell_data]) # this is not working as expected so we will just delete the variable directly # also transfer labelled index to HDF5 index_labelled = _tmp_single_cell_index[keep_index] @@ -687,7 +687,7 @@ def _transfer_tempmmap_to_hdf5(self): self.log("single-cell index labelled created.") del index_labelled - #self._clear_cache(vars_to_delete=[index_labelled]) # this is not working as expected so we will just delete the variable directly + # self._clear_cache(vars_to_delete=[index_labelled]) # this is not working as expected so we will just delete the variable directly hf.create_dataset( "channel_information", @@ -699,7 +699,7 @@ def _transfer_tempmmap_to_hdf5(self): # cleanup memory del _tmp_single_cell_index - #self._clear_cache(vars_to_delete=[_tmp_single_cell_index]) # this is not working as expected so we will just delete the variable directly + # self._clear_cache(vars_to_delete=[_tmp_single_cell_index]) # this is not working as expected so we will just delete the variable directly os.remove(self._tmp_single_cell_data_path) os.remove(self._tmp_single_cell_index_path) @@ -875,7 +875,7 @@ def process(self, partial=False, n_cells=None, seed=42): self.log("Running in single threaded mode.") results = [] - for arg in tqdm(args, total = len(args), desc = "Processing cell batches"): + for arg in tqdm(args, total=len(args), desc="Processing cell batches"): x = f(arg) results.append(x) else: @@ -899,9 +899,9 @@ def process(self, partial=False, n_cells=None, seed=42): self.save_index_to_remove = flatten(results) - #cleanup memory and remove any no longer required variables + # cleanup memory and remove any no longer required variables del results, args - #self._clear_cache(vars_to_delete=["results", "args"]) # this is not working as expected at the moment so need to manually delete the variables + # self._clear_cache(vars_to_delete=["results", "args"]) # this is not working as expected at the moment so need to manually delete the variables stop_extraction = timeit.default_timer() # calculate duration From dbea03215333fadcab226bf12f81d234370d4538 Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Sophia=20M=C3=A4dler?= <15019107+sophiamaedler@users.noreply.github.com> Date: Wed, 8 Jan 2025 16:28:05 +0100 Subject: [PATCH 09/56] ensure up to date spatialdata version --- requirements.txt | 2 +- requirements_dev.txt | 2 +- 2 files changed, 2 insertions(+), 2 deletions(-) diff --git a/requirements.txt b/requirements.txt index b12d8e47..dc723bac 
100755 --- a/requirements.txt +++ b/requirements.txt @@ -29,7 +29,7 @@ torch pytorch-lightning torchvision -spatialdata +spatialdata>=0.2.0 napari-spatialdata pyqt5 lxml_html_clean diff --git a/requirements_dev.txt b/requirements_dev.txt index 088c9fbb..1333aaf0 100644 --- a/requirements_dev.txt +++ b/requirements_dev.txt @@ -29,7 +29,7 @@ torch pytorch-lightning torchvision -spatialdata +spatialdata>=0.2.0 napari-spatialdata pyqt5 lxml_html_clean From e6b2148b36c37a6d322675c2700540ef80aed171 Mon Sep 17 00:00:00 2001 From: Niklas Schmacke Date: Thu, 9 Jan 2025 12:09:37 +0100 Subject: [PATCH 10/56] Added option to compress hdf5 with gzip --- src/scportrait/pipeline/extraction.py | 19 ++++++++++++++++--- 1 file changed, 16 insertions(+), 3 deletions(-) diff --git a/src/scportrait/pipeline/extraction.py b/src/scportrait/pipeline/extraction.py index afec8da8..615dacb3 100644 --- a/src/scportrait/pipeline/extraction.py +++ b/src/scportrait/pipeline/extraction.py @@ -57,7 +57,13 @@ def __init__(self, *args, **kwargs): self.overwrite_run_path = self.overwrite def _get_compression_type(self): - self.compression_type = "lzf" if self.compression else None + if (self.compression) or (self.compression == "lzf"): + self.compression_type = "lzf" + return self.compression_type + elif self.compression == "gzip": + self.compression_type = "gzip" + return self.compression_type + self.compression_type = None return self.compression_type def _check_config(self): @@ -655,18 +661,25 @@ def _transfer_tempmmap_to_hdf5(self): #self._clear_cache(vars_to_delete=[cell_ids]) # this is not working as expected so we will just delete the variable directly _, c, x, y = _tmp_single_cell_data.shape + print(_tmp_single_cell_data.shape) + print(self.image_size) + print(keep_index.shape) single_cell_data = hf.create_dataset( "single_cell_data", shape=(len(keep_index), c, x, y), chunks=(1, 1, self.image_size, self.image_size), - compression=self.compression_type, + # compression=self.compression_type, + compression='gzip', #was lzf, gzip works dtype=np.float16, + # rdcc_nbytes=5242880000, # 5gb 1024 * 1024 * 5000 + # rdcc_w0=1, + # rdcc_nslots=50000, ) # populate dataset in loop to prevent loading of entire dataset into memory # this is required to process large datasets to not run into memory issues for ix, i in enumerate(keep_index): - single_cell_data[ix] = _tmp_single_cell_data[i] + single_cell_data[ix] = _tmp_single_cell_data[i] self.log("single-cell data created") del single_cell_data From d53eab4bdfdda87c7766fced8da41fc0546d22cd Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Sophia=20M=C3=A4dler?= <15019107+sophiamaedler@users.noreply.github.com> Date: Thu, 9 Jan 2025 13:16:58 +0100 Subject: [PATCH 11/56] add workaround for sdata objects with multiple labels supports some labels not being in the scportrait compatible format --- src/scportrait/pipeline/_utils/sdata_io.py | 8 +++++--- 1 file changed, 5 insertions(+), 3 deletions(-) diff --git a/src/scportrait/pipeline/_utils/sdata_io.py b/src/scportrait/pipeline/_utils/sdata_io.py index 47017a1c..89c480e0 100644 --- a/src/scportrait/pipeline/_utils/sdata_io.py +++ b/src/scportrait/pipeline/_utils/sdata_io.py @@ -71,10 +71,12 @@ def _read_sdata(self) -> SpatialData: _sdata = SpatialData() _sdata.write(self.sdata_path, overwrite=True) + allowed_labels = ["seg_all_nucleus", "seg_all_cytosol"] for key in _sdata.labels: - segmentation_object = _sdata.labels[key] - if not hasattr(segmentation_object.attrs, "cell_ids"): - segmentation_object = 
spLabels2DModel().convert(segmentation_object, classes=None) + if key in allowed_labels: + segmentation_object = _sdata.labels[key] + if not hasattr(segmentation_object.attrs, "cell_ids"): + segmentation_object = spLabels2DModel().convert(segmentation_object, classes=None) return _sdata From 1a2a1b2b59d032b41a7e161e56b7318ba6725731 Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Sophia=20M=C3=A4dler?= <15019107+sophiamaedler@users.noreply.github.com> Date: Thu, 9 Jan 2025 14:23:42 +0100 Subject: [PATCH 12/56] initial implementation to explicitly specify segmentation masks to be used for extraction --- src/scportrait/pipeline/extraction.py | 68 ++++++++++++++++++--------- 1 file changed, 45 insertions(+), 23 deletions(-) diff --git a/src/scportrait/pipeline/extraction.py b/src/scportrait/pipeline/extraction.py index 0b83ead3..53473b32 100644 --- a/src/scportrait/pipeline/extraction.py +++ b/src/scportrait/pipeline/extraction.py @@ -267,24 +267,53 @@ def _get_segmentation_info(self): f"Found no segmentation masks with key {self.segmentation_key}. Cannot proceed with extraction." ) - # get relevant segmentation masks to perform extraction on - nucleus_key = f"{self.segmentation_key}_nucleus" + #intialize default values to track what should be extracted + self.nucleus_key = None + self.cytosol_key = None + self.extract_nucleus_mask = False + self.extract_cytosol_mask = False + + if "segmentation_mask" in self.config: + allowed_mask_values = ["nucleus", "cytosol"] + allowed_mask_values = [f"{self.segmentation_key}_{x}" for x in allowed_mask_values] + + if isinstance(self.config["segmentation_mask"], str): + assert (self.config["segmentation_mask"] in allowed_mask_values) + + if "nucleus" in self.main_segmenation_mask: + self.nucleus_key = self.main_segmenation_mask + self.extract_nucleus_mask = True + + elif "cytosol" in self.main_segmenation_mask: + self.cytosol_key = self.main_segmenation_mask + self.extract_cytosol_mask = True + else: + raise ValueError(f"Segmentation mask {self.main_segmenation_mask} is not a valid mask to extract from.") - if nucleus_key in relevant_masks: - self.extract_nucleus_mask = True - self.nucleus_key = nucleus_key - else: - self.extract_nucleus_mask = False - self.nucleus_key = None + elif isinstance(self.config["segmentation_mask"], list): + assert all(x in allowed_mask_values for x in self.config["segmentation_mask"]) - cytosol_key = f"{self.segmentation_key}_cytosol" + for x in self.config["segmentation_mask"]: + if "nucleus" in x: + self.nucleus_key = x + self.extract_nucleus_mask = True + if "cytosol" in x: + self.cytosol_key = x + self.extract_cytosol_mask = True - if cytosol_key in relevant_masks: - self.extract_cytosol_mask = True - self.cytosol_key = cytosol_key else: - self.extract_cytosol_mask = False - self.cytosol_key = None + # get relevant segmentation masks to perform extraction on + nucleus_key = f"{self.segmentation_key}_nucleus" + + if nucleus_key in relevant_masks: + self.extract_nucleus_mask = True + self.nucleus_key = nucleus_key + + cytosol_key = f"{self.segmentation_key}_cytosol" + + if cytosol_key in relevant_masks: + self.extract_cytosol_mask = True + self.cytosol_key = cytosol_key self.n_masks = np.sum([self.extract_nucleus_mask, self.extract_cytosol_mask]) self.masks = [x for x in [self.nucleus_key, self.cytosol_key] if x is not None] @@ -661,25 +690,18 @@ def _transfer_tempmmap_to_hdf5(self): # self._clear_cache(vars_to_delete=[cell_ids]) # this is not working as expected so we will just delete the variable directly _, c, x, y = 
_tmp_single_cell_data.shape - print(_tmp_single_cell_data.shape) - print(self.image_size) - print(keep_index.shape) single_cell_data = hf.create_dataset( "single_cell_data", shape=(len(keep_index), c, x, y), chunks=(1, 1, self.image_size, self.image_size), - # compression=self.compression_type, - compression='gzip', #was lzf, gzip works + compression=self.compression_type, dtype=np.float16, - # rdcc_nbytes=5242880000, # 5gb 1024 * 1024 * 5000 - # rdcc_w0=1, - # rdcc_nslots=50000, ) # populate dataset in loop to prevent loading of entire dataset into memory # this is required to process large datasets to not run into memory issues for ix, i in enumerate(keep_index): - single_cell_data[ix] = _tmp_single_cell_data[i] + single_cell_data[ix] = _tmp_single_cell_data[i] self.log("single-cell data created") del single_cell_data From f7d2c341c6465dc808267fbee5ee4be859df7e2f Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Sophia=20M=C3=A4dler?= <15019107+sophiamaedler@users.noreply.github.com> Date: Thu, 9 Jan 2025 14:29:11 +0100 Subject: [PATCH 13/56] fix bug incorrectly saved mask names --- src/scportrait/pipeline/extraction.py | 10 +++++----- 1 file changed, 5 insertions(+), 5 deletions(-) diff --git a/src/scportrait/pipeline/extraction.py b/src/scportrait/pipeline/extraction.py index 53473b32..ab8a6de5 100644 --- a/src/scportrait/pipeline/extraction.py +++ b/src/scportrait/pipeline/extraction.py @@ -280,15 +280,15 @@ def _get_segmentation_info(self): if isinstance(self.config["segmentation_mask"], str): assert (self.config["segmentation_mask"] in allowed_mask_values) - if "nucleus" in self.main_segmenation_mask: - self.nucleus_key = self.main_segmenation_mask + if "nucleus" in self.config["segmentation_mask"]: + self.nucleus_key = self.config["segmentation_mask"] self.extract_nucleus_mask = True - elif "cytosol" in self.main_segmenation_mask: - self.cytosol_key = self.main_segmenation_mask + elif "cytosol" in self.config["segmentation_mask"]: + self.cytosol_key = self.config["segmentation_mask"] self.extract_cytosol_mask = True else: - raise ValueError(f"Segmentation mask {self.main_segmenation_mask} is not a valid mask to extract from.") + raise ValueError(f"Segmentation mask {self.config['segmentation_mask']} is not a valid mask to extract from.") elif isinstance(self.config["segmentation_mask"], list): assert all(x in allowed_mask_values for x in self.config["segmentation_mask"]) From 7da9a873b6b07d906a6a76fb5a4db4783c03ddf6 Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Sophia=20M=C3=A4dler?= <15019107+sophiamaedler@users.noreply.github.com> Date: Thu, 9 Jan 2025 14:50:07 +0100 Subject: [PATCH 14/56] fix precommit issues --- src/scportrait/pipeline/extraction.py | 8 +++++--- 1 file changed, 5 insertions(+), 3 deletions(-) diff --git a/src/scportrait/pipeline/extraction.py b/src/scportrait/pipeline/extraction.py index ab8a6de5..1060dc0c 100644 --- a/src/scportrait/pipeline/extraction.py +++ b/src/scportrait/pipeline/extraction.py @@ -267,7 +267,7 @@ def _get_segmentation_info(self): f"Found no segmentation masks with key {self.segmentation_key}. Cannot proceed with extraction." 
) - #intialize default values to track what should be extracted + # intialize default values to track what should be extracted self.nucleus_key = None self.cytosol_key = None self.extract_nucleus_mask = False @@ -278,7 +278,7 @@ def _get_segmentation_info(self): allowed_mask_values = [f"{self.segmentation_key}_{x}" for x in allowed_mask_values] if isinstance(self.config["segmentation_mask"], str): - assert (self.config["segmentation_mask"] in allowed_mask_values) + assert self.config["segmentation_mask"] in allowed_mask_values if "nucleus" in self.config["segmentation_mask"]: self.nucleus_key = self.config["segmentation_mask"] @@ -288,7 +288,9 @@ def _get_segmentation_info(self): self.cytosol_key = self.config["segmentation_mask"] self.extract_cytosol_mask = True else: - raise ValueError(f"Segmentation mask {self.config['segmentation_mask']} is not a valid mask to extract from.") + raise ValueError( + f"Segmentation mask {self.config['segmentation_mask']} is not a valid mask to extract from." + ) elif isinstance(self.config["segmentation_mask"], list): assert all(x in allowed_mask_values for x in self.config["segmentation_mask"]) From d4663d8d759a9a3bed04118f980c4cafcb4e7226 Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Sophia=20M=C3=A4dler?= <15019107+sophiamaedler@users.noreply.github.com> Date: Sat, 11 Jan 2025 12:45:15 +0100 Subject: [PATCH 15/56] Fix typo in file naming --- src/scportrait/pipeline/_base.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/src/scportrait/pipeline/_base.py b/src/scportrait/pipeline/_base.py index 4bc63d87..fdca9b64 100644 --- a/src/scportrait/pipeline/_base.py +++ b/src/scportrait/pipeline/_base.py @@ -158,7 +158,7 @@ class ProcessingStep(Logable): DEFAULT_SEGMENTATION_DIR_NAME = "segmentation" DEFAULT_TILES_FOLDER = "tiles" - DEFAULT_EXTRACTIN_DIR_NAME = "extraction" + DEFAULT_EXTRACTION_DIR_NAME = "extraction" DEFAULT_DATA_DIR = "data" DEFAULT_IMAGE_DTYPE = np.uint16 From fbac9fe0f0e477b269a9d5a93b0b7714fb42fb26 Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Sophia=20M=C3=A4dler?= <15019107+sophiamaedler@users.noreply.github.com> Date: Sat, 11 Jan 2025 12:45:30 +0100 Subject: [PATCH 16/56] relocate removed classes file to extraction directory --- src/scportrait/pipeline/extraction.py | 3 ++- 1 file changed, 2 insertions(+), 1 deletion(-) diff --git a/src/scportrait/pipeline/extraction.py b/src/scportrait/pipeline/extraction.py index 1060dc0c..e7cddf01 100644 --- a/src/scportrait/pipeline/extraction.py +++ b/src/scportrait/pipeline/extraction.py @@ -450,9 +450,10 @@ def _get_label_info(self, arg): def _save_removed_classes(self, classes): # define path where classes should be saved + filtered_path = os.path.join( self.project_location, - self.DEFAULT_SEGMENTATION_DIR_NAME, + self.DEFAULT_EXTRACTION_DIR_NAME, self.DEFAULT_REMOVED_CLASSES_FILE, ) From 22bd181e84dbb576d2ffa0eaabd5363ed0d2148a Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Sophia=20M=C3=A4dler?= <15019107+sophiamaedler@users.noreply.github.com> Date: Sat, 11 Jan 2025 12:47:21 +0100 Subject: [PATCH 17/56] ruff linting --- src/scportrait/pipeline/extraction.py | 1 - 1 file changed, 1 deletion(-) diff --git a/src/scportrait/pipeline/extraction.py b/src/scportrait/pipeline/extraction.py index e7cddf01..87c6b46c 100644 --- a/src/scportrait/pipeline/extraction.py +++ b/src/scportrait/pipeline/extraction.py @@ -450,7 +450,6 @@ def _get_label_info(self, arg): def _save_removed_classes(self, classes): # define path where classes should be saved - filtered_path = os.path.join( self.project_location, 
self.DEFAULT_EXTRACTION_DIR_NAME, From d922f3ed6b20066b159c8bf5af7511108ae3361d Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Sophia=20M=C3=A4dler?= <15019107+sophiamaedler@users.noreply.github.com> Date: Wed, 15 Jan 2025 13:34:13 +0100 Subject: [PATCH 18/56] ensure angles are always stored as a list --- src/scportrait/tools/ml/transforms.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/src/scportrait/tools/ml/transforms.py b/src/scportrait/tools/ml/transforms.py index b9769895..08baba1c 100644 --- a/src/scportrait/tools/ml/transforms.py +++ b/src/scportrait/tools/ml/transforms.py @@ -19,7 +19,7 @@ def __init__(self, choices=4, include_zero=True): delta = (360 - angles[-1]) / 2 angles = angles + delta - self.choices = angles + self.choices = angles.tolist() def __call__(self, tensor): angle = random.choice(self.choices) From a2419bb3078c8ba13f38df1911827fdf1c29cab7 Mon Sep 17 00:00:00 2001 From: Niklas Schmacke Date: Fri, 17 Jan 2025 19:09:18 +0100 Subject: [PATCH 19/56] Fixed gzip compression --- src/scportrait/pipeline/extraction.py | 8 ++++---- 1 file changed, 4 insertions(+), 4 deletions(-) diff --git a/src/scportrait/pipeline/extraction.py b/src/scportrait/pipeline/extraction.py index 87c6b46c..1d032ebe 100644 --- a/src/scportrait/pipeline/extraction.py +++ b/src/scportrait/pipeline/extraction.py @@ -57,13 +57,13 @@ def __init__(self, *args, **kwargs): self.overwrite_run_path = self.overwrite def _get_compression_type(self): - if (self.compression) or (self.compression == "lzf"): + if (self.compression == True) or (self.compression == "lzf"): self.compression_type = "lzf" - return self.compression_type elif self.compression == "gzip": self.compression_type = "gzip" - return self.compression_type - self.compression_type = None + else: + self.compression_type = None + self.log(f"Compression algorithm: {self.compression_type}") return self.compression_type def _check_config(self): From 144025f46458013578f4c536bef77e683e66d977 Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Sophia=20M=C3=A4dler?= <15019107+sophiamaedler@users.noreply.github.com> Date: Fri, 17 Jan 2025 19:25:29 +0100 Subject: [PATCH 20/56] improve behaviour of mask matching behaviour for deprecated config keys will use the default parameters as if nothing was specified --- src/scportrait/pipeline/segmentation/workflows.py | 4 +++- 1 file changed, 3 insertions(+), 1 deletion(-) diff --git a/src/scportrait/pipeline/segmentation/workflows.py b/src/scportrait/pipeline/segmentation/workflows.py index 008a2729..d5f2614d 100644 --- a/src/scportrait/pipeline/segmentation/workflows.py +++ b/src/scportrait/pipeline/segmentation/workflows.py @@ -653,7 +653,9 @@ def _check_for_mask_matching_filtering(self) -> None: else: # add deprecation warning for old config setup if "filter_status" in self.config.keys(): - Warning("filter_status is deprecated, please use match_masks instead Will not perform filtering.") + self.filter_match_masks = True + self.mask_matching_filtering_threshold = 0.95 + Warning("filter_status is deprecated, please use match_masks instead. 
Will use default settings for mask matching.") # default behaviour that this filtering should be performed, otherwise another additional step is required before extraction self.filter_match_masks = True From 696ea5e97f9e96371f2ea7478397a18dde7a1500 Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Sophia=20M=C3=A4dler?= <15019107+sophiamaedler@users.noreply.github.com> Date: Fri, 17 Jan 2025 19:29:57 +0100 Subject: [PATCH 21/56] only check loaded segmentation masks if available --- src/scportrait/pipeline/project.py | 7 ++++--- 1 file changed, 4 insertions(+), 3 deletions(-) diff --git a/src/scportrait/pipeline/project.py b/src/scportrait/pipeline/project.py index ae20d64c..864195d0 100644 --- a/src/scportrait/pipeline/project.py +++ b/src/scportrait/pipeline/project.py @@ -888,9 +888,10 @@ def load_input_from_sdata( # ensure that the provided nucleus and cytosol segmentations fullfill the scPortrait requirements # requirements are: # 1. The nucleus segmentation mask and the cytosol segmentation mask must contain the same ids - assert ( - self.sdata[self.nuc_seg_name].attrs["cell_ids"] == self.sdata[self.cyto_seg_name].attrs["cell_ids"] - ), "The nucleus segmentation mask and the cytosol segmentation mask must contain the same ids." + if self.nuc_seg_status in self.sdata.keys() and self.cyto_seg_status in self.sdata.keys(): + assert ( + self.sdata[self.nuc_seg_name].attrs["cell_ids"] == self.sdata[self.cyto_seg_name].attrs["cell_ids"] + ), "The nucleus segmentation mask and the cytosol segmentation mask must contain the same ids." # 2. the nucleus segmentation ids and the cytosol segmentation ids need to match # THIS NEEDS TO BE IMPLEMENTED HERE From 622a45f7b925687f2467c1ab2f00125f4131b4ce Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Sophia=20M=C3=A4dler?= <15019107+sophiamaedler@users.noreply.github.com> Date: Fri, 17 Jan 2025 19:33:03 +0100 Subject: [PATCH 22/56] ruff linting --- src/scportrait/pipeline/extraction.py | 2 +- src/scportrait/pipeline/segmentation/workflows.py | 4 +++- 2 files changed, 4 insertions(+), 2 deletions(-) diff --git a/src/scportrait/pipeline/extraction.py b/src/scportrait/pipeline/extraction.py index 1d032ebe..505f6081 100644 --- a/src/scportrait/pipeline/extraction.py +++ b/src/scportrait/pipeline/extraction.py @@ -57,7 +57,7 @@ def __init__(self, *args, **kwargs): self.overwrite_run_path = self.overwrite def _get_compression_type(self): - if (self.compression == True) or (self.compression == "lzf"): + if (self.compression is True) or (self.compression == "lzf"): self.compression_type = "lzf" elif self.compression == "gzip": self.compression_type = "gzip" diff --git a/src/scportrait/pipeline/segmentation/workflows.py b/src/scportrait/pipeline/segmentation/workflows.py index d5f2614d..94ed8c44 100644 --- a/src/scportrait/pipeline/segmentation/workflows.py +++ b/src/scportrait/pipeline/segmentation/workflows.py @@ -655,7 +655,9 @@ def _check_for_mask_matching_filtering(self) -> None: if "filter_status" in self.config.keys(): self.filter_match_masks = True self.mask_matching_filtering_threshold = 0.95 - Warning("filter_status is deprecated, please use match_masks instead. Will use default settings for mask matching.") + Warning( + "filter_status is deprecated, please use match_masks instead. Will use default settings for mask matching." 
+ ) # default behaviour that this filtering should be performed, otherwise another additional step is required before extraction self.filter_match_masks = True From e34129ebbb7b8c0a0d586208408065c251e323fd Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Sophia=20M=C3=A4dler?= <15019107+sophiamaedler@users.noreply.github.com> Date: Sat, 18 Jan 2025 14:25:46 +0100 Subject: [PATCH 23/56] remove unnecessary print statement --- src/scportrait/pipeline/segmentation/segmentation.py | 1 - 1 file changed, 1 deletion(-) diff --git a/src/scportrait/pipeline/segmentation/segmentation.py b/src/scportrait/pipeline/segmentation/segmentation.py index 9b2edcd9..5d8a0f9a 100644 --- a/src/scportrait/pipeline/segmentation/segmentation.py +++ b/src/scportrait/pipeline/segmentation/segmentation.py @@ -742,7 +742,6 @@ def _resolve_sharding(self, sharding_plan): local_hf = h5py.File(local_output, "r") local_hdf_labels = local_hf.get(self.DEFAULT_MASK_NAME)[:] - print(type(local_hdf_labels)) shifted_map, edge_labels = shift_labels( local_hdf_labels, class_id_shift, From 43f4a9814b6dc7031b247902fa0efe683fe03f9e Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Sophia=20M=C3=A4dler?= <15019107+sophiamaedler@users.noreply.github.com> Date: Sat, 18 Jan 2025 15:55:06 +0100 Subject: [PATCH 24/56] fix multiprocessing worker naming issue when multiple runs are done sequentially --- src/scportrait/pipeline/segmentation/segmentation.py | 5 +++-- src/scportrait/pipeline/segmentation/workflows.py | 4 ++++ 2 files changed, 7 insertions(+), 2 deletions(-) diff --git a/src/scportrait/pipeline/segmentation/segmentation.py b/src/scportrait/pipeline/segmentation/segmentation.py index 5d8a0f9a..63d62490 100644 --- a/src/scportrait/pipeline/segmentation/segmentation.py +++ b/src/scportrait/pipeline/segmentation/segmentation.py @@ -901,8 +901,9 @@ def _resolve_sharding(self, sharding_plan): if not self.deep_debug: self._cleanup_shards(sharding_plan) - def _initializer_function(self, gpu_id_list): + def _initializer_function(self, gpu_id_list, n_processes): current_process().gpu_id_list = gpu_id_list + current_process().n_processes = n_processes def _perform_segmentation(self, shard_list): # get GPU status @@ -920,7 +921,7 @@ def _perform_segmentation(self, shard_list): with mp.get_context(self.context).Pool( processes=self.n_processes, initializer=self._initializer_function, - initargs=[self.gpu_id_list], + initargs=[self.gpu_id_list, self.n_processes], ) as pool: list( tqdm( diff --git a/src/scportrait/pipeline/segmentation/workflows.py b/src/scportrait/pipeline/segmentation/workflows.py index 94ed8c44..2ad5d0cb 100644 --- a/src/scportrait/pipeline/segmentation/workflows.py +++ b/src/scportrait/pipeline/segmentation/workflows.py @@ -15,6 +15,7 @@ from skimage.filters import median from skimage.morphology import binary_erosion, dilation, disk, erosion from skimage.segmentation import watershed +import _multiprocessing from scportrait.pipeline._utils.segmentation import ( contact_filter, @@ -1353,6 +1354,9 @@ def _check_gpu_status(self): gpu_id_list = current.gpu_id_list cpu_id = int(cpu_name[cpu_name.find("-") + 1 :]) - 1 + if cpu_id >= len(gpu_id_list): + cpu_id = cpu_id%current.n_processes + # track gpu_id and update GPU status self.gpu_id = gpu_id_list[cpu_id] self.status = "multi_GPU" From 419c3454a7f1e9f5c27d719c80f557c03607a5b7 Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Sophia=20M=C3=A4dler?= <15019107+sophiamaedler@users.noreply.github.com> Date: Sat, 18 Jan 2025 19:03:38 +0100 Subject: [PATCH 25/56] ruff linting --- 
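
Context for the fix in patches 24/25 above: Python names pool workers with a counter that keeps incrementing across sequential pools in the same interpreter (a second run spawns ForkPoolWorker-3, -4, ...), so indexing a fixed GPU list by the raw worker number eventually overflows the list. Wrapping the index with modulo n_processes restores a valid mapping. Below is a minimal, self-contained sketch of that idea, assuming a hypothetical two-GPU setup; the helper names are illustrative and not part of scPortrait's API.

import multiprocessing as mp
from multiprocessing import current_process


def _initializer(gpu_id_list, n_processes):
    # mirror the patch: stash the GPU list and the pool size on each worker process
    current_process().gpu_id_list = gpu_id_list
    current_process().n_processes = n_processes


def _resolve_gpu(_task):
    current = current_process()
    name = current.name  # e.g. "ForkPoolWorker-3" in a second, sequential pool
    worker_idx = int(name[name.find("-") + 1 :]) - 1
    if worker_idx >= len(current.gpu_id_list):
        worker_idx = worker_idx % current.n_processes  # wrap instead of overflowing
    return name, current.gpu_id_list[worker_idx]


if __name__ == "__main__":
    gpus = [0, 1]  # hypothetical device ids
    for _run in range(2):  # the second pool's workers are numbered 3 and 4
        with mp.get_context("fork").Pool(2, initializer=_initializer, initargs=(gpus, 2)) as pool:
            print(pool.map(_resolve_gpu, range(2)))

Without the modulo wrap, the second loop iteration would index past the end of the GPU list, which is exactly the "multiple runs done sequentially" failure mode the commit message describes.
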
src/scportrait/pipeline/segmentation/workflows.py | 3 +-- 1 file changed, 1 insertion(+), 2 deletions(-) diff --git a/src/scportrait/pipeline/segmentation/workflows.py b/src/scportrait/pipeline/segmentation/workflows.py index 2ad5d0cb..b677597f 100644 --- a/src/scportrait/pipeline/segmentation/workflows.py +++ b/src/scportrait/pipeline/segmentation/workflows.py @@ -15,7 +15,6 @@ from skimage.filters import median from skimage.morphology import binary_erosion, dilation, disk, erosion from skimage.segmentation import watershed -import _multiprocessing from scportrait.pipeline._utils.segmentation import ( contact_filter, @@ -1355,7 +1354,7 @@ def _check_gpu_status(self): cpu_id = int(cpu_name[cpu_name.find("-") + 1 :]) - 1 if cpu_id >= len(gpu_id_list): - cpu_id = cpu_id%current.n_processes + cpu_id = cpu_id % current.n_processes # track gpu_id and update GPU status self.gpu_id = gpu_id_list[cpu_id] From 405ea46fa390fccd3c963f0a544c116a533551d2 Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Sophia=20M=C3=A4dler?= <15019107+sophiamaedler@users.noreply.github.com> Date: Thu, 23 Jan 2025 16:47:32 +0100 Subject: [PATCH 26/56] relocate import statements that require java to lazy import statement this allows you to not have a working java installation to import and work with scportrait as long as you do not require the stitching capabilities --- src/scportrait/tools/stitch/_stitch.py | 39 +++++++++++++++++--------- 1 file changed, 25 insertions(+), 14 deletions(-) diff --git a/src/scportrait/tools/stitch/_stitch.py b/src/scportrait/tools/stitch/_stitch.py index 5b21148d..90ead29f 100644 --- a/src/scportrait/tools/stitch/_stitch.py +++ b/src/scportrait/tools/stitch/_stitch.py @@ -21,13 +21,7 @@ from scportrait.io.daskmmap import dask_array_from_path from scportrait.processing.images._image_processing import rescale_image from scportrait.tools.stitch._utils.ashlar_plotting import plot_edge_quality, plot_edge_scatter -from scportrait.tools.stitch._utils.filereaders import ( - BioformatsReaderRescale, - FilePatternReaderRescale, -) from scportrait.tools.stitch._utils.filewriters import write_ome_zarr, write_spatialdata, write_tif, write_xml -from scportrait.tools.stitch._utils.parallelized_ashlar import ParallelEdgeAligner, ParallelMosaic - class Stitcher: """ @@ -65,7 +59,7 @@ def __init__( do_intensity_rescale: bool | str = True, rescale_range: tuple = (1, 99), channel_order: list[str] = None, - reader_type=FilePatternReaderRescale, + reader_type="FilePatternReaderRescale", orientation: dict = None, plot_QC: bool = True, overwrite: bool = False, @@ -112,8 +106,13 @@ def __init__( """ self._lazy_imports() + # workaround for lazy imports of module + if self.reader_type == "FilePatternReaderRescale": + self.reader_type = self.FilePatternReaderRescale + if orientation is None: orientation = {"flip_x": False, "flip_y": True} + self.input_dir = input_dir self.slidename = slidename self.outdir = outdir @@ -158,10 +157,21 @@ def _lazy_imports(self): from ashlar.reg import EdgeAligner, Mosaic from ashlar.scripts.ashlar import process_axis_flip + from scportrait.tools.stitch._utils.filereaders import ( + BioformatsReaderRescale, + FilePatternReaderRescale, + ) + + from scportrait.tools.stitch._utils.parallelized_ashlar import ParallelEdgeAligner, ParallelMosaic + self.ashlar_thumbnail = thumbnail self.ashlar_EdgeAligner = EdgeAligner self.ashlar_Mosaic = Mosaic self.ashlar_process_axis_flip = process_axis_flip + self.BioformatsReaderRescale = BioformatsReaderRescale + self.FilePatternReaderRescale = 
FilePatternReaderRescale + self.ParallelEdgeAligner = ParallelEdgeAligner + self.ParallelMosaic = ParallelMosaic def __exit__(self): self._clear_cache() @@ -294,14 +304,14 @@ def _initialize_reader(self): """ Initialize the reader for reading image tiles. """ - if self.reader_type == FilePatternReaderRescale: + if self.reader_type == self.FilePatternReaderRescale: self.reader = self.reader_type( self.input_dir, self.pattern, self.overlap, rescale_range=self.rescale_range, ) - elif self.reader_type == BioformatsReaderRescale: + elif self.reader_type == self.BioformatsReaderRescale: self.reader = self.reader_type(self.input_dir, rescale_range=self.rescale_range) # setup correct orientation of slide (this depends on microscope used to generate the data) @@ -564,7 +574,7 @@ class ParallelStitcher(Stitcher): do_intensity_rescale (bool or "full_image", optional): Flag to indicate whether to rescale image intensities (default is True). Alternatively, set to "full_image" to rescale the entire image. rescale_range (tuple or dict, optional): If all channels should be rescaled to the same range pass a tuple with the percentiles for rescaling (default is (1, 99)). Alternatively, a dictionary can be passed with the channel names as keys and the percentiles as values if each channel should be rescaled to a different range. channel_order (list, optional): Order of channels in the generated output mosaic. If none (default value) the order of the channels is left unchanged. - reader_type (class, optional): Type of reader to use for reading image tiles (default is FilePatternReaderRescale). + reader_type (class, optional): Type of reader to use for reading image tiles (default is "FilePatternReaderRescale"). orientation (dict, optional): Dictionary specifying which dimensions of the slide to flip (default is {'flip_x': False, 'flip_y': True}). plot_QC (bool, optional): Flag to indicate whether to plot quality control (QC) figures (default is True). overwrite (bool, optional): Flag to indicate whether to overwrite the output directory if it already exists (default is False). @@ -588,7 +598,7 @@ def __init__( WGAchannel: str = None, channel_order: list[str] = None, overwrite: bool = False, - reader_type=FilePatternReaderRescale, + reader_type="FilePatternReaderRescale", orientation=None, cache: str = None, threads: int = 20, @@ -613,8 +623,9 @@ def __init__( overwrite, cache, ) + # dirty fix to avoide multithreading error with BioformatsReader until this can be fixed - if self.reader_type == BioformatsReaderRescale: + if self.reader_type == self.BioformatsReaderRescale: threads = 1 print( "BioformatsReaderRescale does not support multithreading for calculating the error threshold currently. Proceeding with 1 thread." @@ -632,7 +643,7 @@ def _initialize_aligner(self): Returns: aligner (ParallelEdgeAligner): Initialized ParallelEdgeAligner object. 
""" - aligner = ParallelEdgeAligner( + aligner = self.ParallelEdgeAligner( self.reader, channel=self.stitching_channel_id, filter_sigma=self.filter_sigma, @@ -644,7 +655,7 @@ def _initialize_aligner(self): return aligner def _initialize_mosaic(self): - mosaic = ParallelMosaic( + mosaic =self.ParallelMosaic( self.aligner, self.aligner.mosaic_shape, verbose=True, channels=self.channels, n_threads=self.threads ) return mosaic From e3fbd309976f7343bf3af73c43269c6b5a883805 Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Sophia=20M=C3=A4dler?= <15019107+sophiamaedler@users.noreply.github.com> Date: Sat, 25 Jan 2025 20:10:44 +0100 Subject: [PATCH 27/56] adapt selection workflow to work with new py-lmd version + improves selection performance even for large datasets see https://github.com/MannLabs/py-lmd/pull/11 for more information --- src/scportrait/pipeline/selection.py | 195 +++++++++++++++++++++++---- 1 file changed, 168 insertions(+), 27 deletions(-) diff --git a/src/scportrait/pipeline/selection.py b/src/scportrait/pipeline/selection.py index 0afb36a8..cc26c30c 100644 --- a/src/scportrait/pipeline/selection.py +++ b/src/scportrait/pipeline/selection.py @@ -4,8 +4,19 @@ from alphabase.io import tempmmap from lmd.lib import SegmentationLoader +import h5py +import timeit +import pandas as pd +import pickle +from scipy.sparse import coo_array +from tqdm.auto import tqdm +from functools import partial as func_partial +import multiprocessing as mp + from scportrait.pipeline._base import ProcessingStep +from scportrait.pipeline._utils.helper import flatten +import matplotlib.pyplot as plt class LMDSelection(ProcessingStep): """ @@ -13,19 +24,58 @@ class LMDSelection(ProcessingStep): This method class relies on the functionality of the pylmd library. """ - # define all valid path optimization methods used with the "path_optimization" argument in the configuration - VALID_PATH_OPTIMIZERS = ["none", "hilbert", "greedy"] - def __init__(self, *args, **kwargs): super().__init__(*args, **kwargs) + self._check_config() self.name = None self.cell_sets = None self.calibration_marker = None + self.deep_debug = False #flag for deep debugging by developers + + def _check_config(self): + assert "segmentation_channel" in self.config, "segmentation_channel not defined in config" + self.segmentation_channel_to_select = self.config["segmentation_channel"] + + # check for optional config parameters + + #this defines how large the box mask around the center of a cell is for the coordinate extraction + #assumption is that all pixels belonging to each mask are within the box otherwise they will be cut off during cutting contour generation + + if "cell_width" in self.config: + self.cell_radius = self.config["cell_width"] + else: + self.cell_radius = 100 + + if "threads" in self.config: + self.threads = self.config["threads"] + assert self.threads > 0, "threads must be greater than 0" + assert isinstance(self.threads, int), "threads must be an integer" + else: + self.threads = 10 + + if "batch_size_coordinate_extraction" in self.config: + self.batch_size = self.config["batch_size_coordinate_extraction"] + assert self.batch_size > 0, "batch_size_coordinate_extraction must be greater than 0" + assert isinstance(self.batch_size, int), "batch_size_coordinate_extraction must be an integer" + else: + self.batch_size = 100 + + if "orientation_transform" in self.config: + self.orientation_transform = self.config["orientation_transform"] + else: + self.orientation_transform = np.array([[0, -1], [1, 0]]) + 
self.config["orientation_transform"] = self.orientation_transform #ensure its also in config so its passed on to the segmentation loader + + if "processes_cell_sets" in self.config: + self.processes_cell_sets = self.config["processes_cell_sets"] + assert self.processes_cell_sets > 0, "processes_cell_sets must be greater than 0" + assert isinstance(self.processes_cell_sets, int), "processes_cell_sets must be an integer" + else: + self.processes_cell_sets = 1 + def _setup_selection(self): - # set orientation transform - self.config["orientation_transform"] = np.array([[0, -1], [1, 0]]) # configure name of extraction if self.name is None: @@ -39,6 +89,102 @@ def _setup_selection(self): savename = name.replace(" ", "_") + ".xml" self.savepath = os.path.join(self.directory, savename) + #check that the segmentation label exists + assert self.segmentation_channel_to_select in self.project.filehandler.get_sdata()._shared_keys, f"Segmentation channel {self.segmentation_channel_to_select} not found in sdata." + + def __get_coords(self, + cell_ids: list, + centers:list[tuple[int, int]], + width:int = 60) -> list[tuple[int, np.ndarray]]: + results = [] + + _sdata = self.project.filehandler.get_sdata() + for i, _id in enumerate(cell_ids): + values = centers[i] + + x_start = np.max([int(values[0]) - width, 0]) + y_start = np.max([int(values[1]) - width, 0]) + + x_end = x_start + width*2 + y_end = y_start + width*2 + + _cropped = _sdata[self.segmentation_channel_to_select][slice(x_start, x_end), slice(y_start, y_end)].compute() + + #optional plotting output for deep debugging + if self.deep_debug: + if self.threads == 1: + plt.figure() + plt.imshow(_cropped) + plt.show() + else: + raise ValueError("Deep debug is not supported with multiple threads.") + + sparse = coo_array(_cropped == _id) + + if 0 in sparse: + Warning(f"Cell {i} with id {_id} is potentially not fully contained in the bounding mask. 
Consider increasing the value for the 'cell_width' parameter in your config.") + + x = sparse.coords[0] + x_start + y = sparse.coords[1] + y_start + + results.append((_id, np.array(list(zip(x, y, strict = True))))) + + return(results) + + def _get_coords_multi(self, width:int, arg: tuple[list[int], np.ndarray]) -> list[tuple[int, np.ndarray]]: + cell_ids, centers = arg + results = self.__get_coords(cell_ids, centers, width) + return(results) + + def _get_coords(self, + cell_ids: list, + centers:list[tuple[int, int]], + width:int = 60, + batch_size:int = 100, + threads:int = 10) -> dict: + + #create batches + n_batches = int(np.ceil(len(cell_ids)/batch_size)) + slices = [(i*batch_size, i*batch_size + batch_size) for i in range(n_batches - 1)] + slices.append(((n_batches - 1)*batch_size, len(cell_ids))) + + batched_args = [(cell_ids[start:end], centers[start:end]) for start, end in slices] + + f = func_partial(self._get_coords_multi, + width + ) + + if threads == 1: # if only one thread is used, the function is called directly to avoid the overhead of multiprocessing + results = [f(arg) for arg in batched_args] + else: + with mp.get_context(self.context).Pool(processes=threads) as pool: + results = list(tqdm( + pool.imap(f, batched_args), + total=len(batched_args), + desc="Processing cell batches", + ) + ) + pool.close() + pool.join() + + results = flatten(results) + return(dict(results)) + + def _get_cell_ids(self, cell_sets: list[dict]) -> list[int]: + cell_ids = [] + for cell_set in cell_sets: + if "classes" in cell_set: + cell_ids.extend(cell_set["classes"]) + else: + Warning(f"Cell set {cell_set['name']} does not contain any classes.") + return(cell_ids) + + def _get_centers(self, cell_ids: list[int]) -> list[tuple[int, int]]: + _sdata = self.project.filehandler.get_sdata() + centers = _sdata["centers_cells"].compute() + centers = centers.loc[cell_ids, :] + return(centers[["y", "x"]].values.tolist()) #needs to be returned as yx to match the coordinate system as saved in spatialdataobjects + def _post_processing_cleanup(self, vars_to_delete: list | None = None): if vars_to_delete is not None: self._clear_cache(vars_to_delete=vars_to_delete) @@ -51,7 +197,6 @@ def _post_processing_cleanup(self, vars_to_delete: list | None = None): def process( self, - segmentation_name: str, cell_sets: list[dict], calibration_marker: np.array, name: str | None = None, @@ -61,9 +206,9 @@ def process( Under the hood this method relies on the pylmd library and utilizies its `SegmentationLoader` Class. Args: - segmentation_name (str): Name of the segmentation to be used for shape generation in the sdata object. cell_sets (list of dict): List of dictionaries containing the sets of cells which should be sorted into a single well. Mandatory keys for each dictionary are: name, classes. Optional keys are: well. calibration_marker (numpy.array): Array of size ‘(3,2)’ containing the calibration marker coordinates in the ‘(row, column)’ format. + name (str, optional): Name of the output file. If not provided, the name will be generated based on the names of the cell sets or if also not specified set to "selected_cells". Example: @@ -77,7 +222,6 @@ def process( # A numpy Array of shape (3, 2) should be passed. calibration_marker = np.array([marker_0, marker_1, marker_2]) - # Sets of cells can be defined by providing a name and a list of classes in a dictionary. 
cells_to_select = [{"name": "dataset1", "classes": [1, 2, 3]}] @@ -122,7 +266,7 @@ def process( convolution_smoothing: 25 # fold reduction of datapoints for compression - poly_compression_factor: 30 + rdp: 0.7 # Optimization of the cutting path inbetween shapes # optimized paths improve the cutting time and the microscopes focus @@ -160,32 +304,29 @@ def process( self._setup_selection() - ## TO Do - # check if classes and seglookup table already exist as pickle file - # if not create them - # else load them and proceed with selection - - # load segmentation from hdf5 - self.path_seg_mask = self.filehandler._load_seg_to_memmap( - [segmentation_name], tmp_dir_abs_path=self._tmp_dir_path - ) + print("Here", flush=True) - segmentation = tempmmap.mmap_array_from_path(self.path_seg_mask) + start_time = timeit.default_timer() + cell_ids = self._get_cell_ids(cell_sets) + centers = self._get_centers(cell_ids) + coord_index = self._get_coords(cell_ids = cell_ids, + centers = centers, + width = self.cell_radius, + batch_size = self.batch_size, + threads = self.threads) + self.log(f"Coordinate lookup index calculation took {timeit.default_timer() - start_time} seconds.") - # create segmentation loader sl = SegmentationLoader( config=self.config, verbose=self.debug, processes=self.config["processes_cell_sets"], ) - if len(segmentation.shape) == 3: - segmentation = np.squeeze(segmentation) - else: - raise ValueError(f"Segmentation shape is not correct. Expected 2D array, got {segmentation.shape}") + shape_collection = sl(None, + self.cell_sets, + self.calibration_marker, + coords_lookup=coord_index) - # get shape collections - shape_collection = sl(segmentation, self.cell_sets, self.calibration_marker) if self.debug: shape_collection.plot(calibration=True) @@ -196,4 +337,4 @@ def process( self.log(f"Saved output at {self.savepath}") # perform post processing cleanup - self._post_processing_cleanup(vars_to_delete=[shape_collection, sl, segmentation]) + self._post_processing_cleanup(vars_to_delete=[shape_collection, sl, coord_index]) From bf3a793792ffb963c9d079019ca8ee7a7b1efabf Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Sophia=20M=C3=A4dler?= <15019107+sophiamaedler@users.noreply.github.com> Date: Sat, 25 Jan 2025 20:12:44 +0100 Subject: [PATCH 28/56] fix remove deprecated parameter --- src/scportrait/pipeline/project.py | 3 --- 1 file changed, 3 deletions(-) diff --git a/src/scportrait/pipeline/project.py b/src/scportrait/pipeline/project.py index 864195d0..caa76f2f 100644 --- a/src/scportrait/pipeline/project.py +++ b/src/scportrait/pipeline/project.py @@ -1080,7 +1080,6 @@ def select( self, cell_sets: list[dict], calibration_marker: np.ndarray | None = None, - segmentation_name: str = "seg_all_nucleus", name: str | None = None, ): """ @@ -1096,10 +1095,8 @@ def select( raise ValueError("No nucleus or cytosol segmentation loaded. Please load a segmentation first.") assert self.sdata is not None, "No sdata object loaded." - assert segmentation_name in self.sdata.labels, f"Segmentation {segmentation_name} not found in sdata object." 
self.selection_f( - segmentation_name=segmentation_name, cell_sets=cell_sets, calibration_marker=calibration_marker, name=name, From c475a6b429d203e1aedab36663f66ef40d364b07 Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Sophia=20M=C3=A4dler?= <15019107+sophiamaedler@users.noreply.github.com> Date: Sat, 25 Jan 2025 20:13:59 +0100 Subject: [PATCH 29/56] fix incorrect check for edge pixels in image crop --- src/scportrait/pipeline/selection.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/src/scportrait/pipeline/selection.py b/src/scportrait/pipeline/selection.py index cc26c30c..8e4c2123 100644 --- a/src/scportrait/pipeline/selection.py +++ b/src/scportrait/pipeline/selection.py @@ -121,7 +121,7 @@ def __get_coords(self, sparse = coo_array(_cropped == _id) - if 0 in sparse: + if 0 in sparse.coords[0] or 0 in sparse.coords[1] or width*2 - 1 in sparse.coords[0] or width*2 - 1 in sparse.coords[1]: Warning(f"Cell {i} with id {_id} is potentially not fully contained in the bounding mask. Consider increasing the value for the 'cell_width' parameter in your config.") x = sparse.coords[0] + x_start From 555323858fbb7c792c47faabf2ece5c303c04b2b Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Sophia=20M=C3=A4dler?= <15019107+sophiamaedler@users.noreply.github.com> Date: Sat, 25 Jan 2025 20:16:01 +0100 Subject: [PATCH 30/56] fix remove debugging statement --- src/scportrait/pipeline/selection.py | 2 -- 1 file changed, 2 deletions(-) diff --git a/src/scportrait/pipeline/selection.py b/src/scportrait/pipeline/selection.py index 8e4c2123..997e33a4 100644 --- a/src/scportrait/pipeline/selection.py +++ b/src/scportrait/pipeline/selection.py @@ -304,8 +304,6 @@ def process( self._setup_selection() - print("Here", flush=True) - start_time = timeit.default_timer() cell_ids = self._get_cell_ids(cell_sets) centers = self._get_centers(cell_ids) From 1af6fe1ffae7daf1606992601b431f3acd588c7f Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Sophia=20M=C3=A4dler?= <15019107+sophiamaedler@users.noreply.github.com> Date: Sat, 25 Jan 2025 20:33:52 +0100 Subject: [PATCH 31/56] ensure uptodate py-lmd version --- requirements.txt | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/requirements.txt b/requirements.txt index de7f4aa0..a143389b 100755 --- a/requirements.txt +++ b/requirements.txt @@ -35,4 +35,4 @@ pyqt5 lxml_html_clean ashlar>=1.19.0 networkx -py-lmd +py-lmd>=1.3.0 From 733c58a07ff82ed19cc004c14fe8e74eb8394011 Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Sophia=20M=C3=A4dler?= <15019107+sophiamaedler@users.noreply.github.com> Date: Sat, 25 Jan 2025 20:34:29 +0100 Subject: [PATCH 32/56] ensure most uptodate py-lmd version --- requirements_dev.txt | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/requirements_dev.txt b/requirements_dev.txt index e705f1ff..8d807d29 100644 --- a/requirements_dev.txt +++ b/requirements_dev.txt @@ -35,7 +35,7 @@ pyqt5 lxml_html_clean ashlar>=1.19.0 networkx -py-lmd +py-lmd>=1.3.0 #packages for building the documentation sphinx From cdcf9bb1cbcf2cb01e1e0f788e644e465c66a40b Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Sophia=20M=C3=A4dler?= <15019107+sophiamaedler@users.noreply.github.com> Date: Sat, 25 Jan 2025 20:38:24 +0100 Subject: [PATCH 33/56] ruff linting --- src/scportrait/pipeline/selection.py | 122 +++++++++++++------------ src/scportrait/tools/stitch/_stitch.py | 4 +- 2 files changed, 65 insertions(+), 61 deletions(-) diff --git a/src/scportrait/pipeline/selection.py b/src/scportrait/pipeline/selection.py index 997e33a4..a3c10746 100644 --- 
a/src/scportrait/pipeline/selection.py +++ b/src/scportrait/pipeline/selection.py @@ -1,22 +1,21 @@ +import multiprocessing as mp import os +import pickle +import timeit +from functools import partial as func_partial +import h5py +import matplotlib.pyplot as plt import numpy as np +import pandas as pd from alphabase.io import tempmmap from lmd.lib import SegmentationLoader - -import h5py -import timeit -import pandas as pd -import pickle from scipy.sparse import coo_array from tqdm.auto import tqdm -from functools import partial as func_partial -import multiprocessing as mp from scportrait.pipeline._base import ProcessingStep from scportrait.pipeline._utils.helper import flatten -import matplotlib.pyplot as plt class LMDSelection(ProcessingStep): """ @@ -32,7 +31,7 @@ def __init__(self, *args, **kwargs): self.cell_sets = None self.calibration_marker = None - self.deep_debug = False #flag for deep debugging by developers + self.deep_debug = False # flag for deep debugging by developers def _check_config(self): assert "segmentation_channel" in self.config, "segmentation_channel not defined in config" @@ -40,8 +39,8 @@ def _check_config(self): # check for optional config parameters - #this defines how large the box mask around the center of a cell is for the coordinate extraction - #assumption is that all pixels belonging to each mask are within the box otherwise they will be cut off during cutting contour generation + # this defines how large the box mask around the center of a cell is for the coordinate extraction + # assumption is that all pixels belonging to each mask are within the box otherwise they will be cut off during cutting contour generation if "cell_width" in self.config: self.cell_radius = self.config["cell_width"] @@ -66,7 +65,9 @@ def _check_config(self): self.orientation_transform = self.config["orientation_transform"] else: self.orientation_transform = np.array([[0, -1], [1, 0]]) - self.config["orientation_transform"] = self.orientation_transform #ensure its also in config so its passed on to the segmentation loader + self.config["orientation_transform"] = ( + self.orientation_transform + ) # ensure its also in config so its passed on to the segmentation loader if "processes_cell_sets" in self.config: self.processes_cell_sets = self.config["processes_cell_sets"] @@ -76,7 +77,6 @@ def _check_config(self): self.processes_cell_sets = 1 def _setup_selection(self): - # configure name of extraction if self.name is None: try: @@ -89,13 +89,14 @@ def _setup_selection(self): savename = name.replace(" ", "_") + ".xml" self.savepath = os.path.join(self.directory, savename) - #check that the segmentation label exists - assert self.segmentation_channel_to_select in self.project.filehandler.get_sdata()._shared_keys, f"Segmentation channel {self.segmentation_channel_to_select} not found in sdata." + # check that the segmentation label exists + assert ( + self.segmentation_channel_to_select in self.project.filehandler.get_sdata()._shared_keys + ), f"Segmentation channel {self.segmentation_channel_to_select} not found in sdata." 
- def __get_coords(self, - cell_ids: list, - centers:list[tuple[int, int]], - width:int = 60) -> list[tuple[int, np.ndarray]]: + def __get_coords( + self, cell_ids: list, centers: list[tuple[int, int]], width: int = 60 + ) -> list[tuple[int, np.ndarray]]: results = [] _sdata = self.project.filehandler.get_sdata() @@ -105,12 +106,14 @@ def __get_coords(self, x_start = np.max([int(values[0]) - width, 0]) y_start = np.max([int(values[1]) - width, 0]) - x_end = x_start + width*2 - y_end = y_start + width*2 + x_end = x_start + width * 2 + y_end = y_start + width * 2 - _cropped = _sdata[self.segmentation_channel_to_select][slice(x_start, x_end), slice(y_start, y_end)].compute() + _cropped = _sdata[self.segmentation_channel_to_select][ + slice(x_start, x_end), slice(y_start, y_end) + ].compute() - #optional plotting output for deep debugging + # optional plotting output for deep debugging if self.deep_debug: if self.threads == 1: plt.figure() @@ -121,44 +124,48 @@ def __get_coords(self, sparse = coo_array(_cropped == _id) - if 0 in sparse.coords[0] or 0 in sparse.coords[1] or width*2 - 1 in sparse.coords[0] or width*2 - 1 in sparse.coords[1]: - Warning(f"Cell {i} with id {_id} is potentially not fully contained in the bounding mask. Consider increasing the value for the 'cell_width' parameter in your config.") + if ( + 0 in sparse.coords[0] + or 0 in sparse.coords[1] + or width * 2 - 1 in sparse.coords[0] + or width * 2 - 1 in sparse.coords[1] + ): + Warning( + f"Cell {i} with id {_id} is potentially not fully contained in the bounding mask. Consider increasing the value for the 'cell_width' parameter in your config." + ) x = sparse.coords[0] + x_start y = sparse.coords[1] + y_start - results.append((_id, np.array(list(zip(x, y, strict = True))))) + results.append((_id, np.array(list(zip(x, y, strict=True))))) - return(results) + return results - def _get_coords_multi(self, width:int, arg: tuple[list[int], np.ndarray]) -> list[tuple[int, np.ndarray]]: + def _get_coords_multi(self, width: int, arg: tuple[list[int], np.ndarray]) -> list[tuple[int, np.ndarray]]: cell_ids, centers = arg results = self.__get_coords(cell_ids, centers, width) - return(results) - - def _get_coords(self, - cell_ids: list, - centers:list[tuple[int, int]], - width:int = 60, - batch_size:int = 100, - threads:int = 10) -> dict: + return results - #create batches - n_batches = int(np.ceil(len(cell_ids)/batch_size)) - slices = [(i*batch_size, i*batch_size + batch_size) for i in range(n_batches - 1)] - slices.append(((n_batches - 1)*batch_size, len(cell_ids))) + def _get_coords( + self, cell_ids: list, centers: list[tuple[int, int]], width: int = 60, batch_size: int = 100, threads: int = 10 + ) -> dict: + # create batches + n_batches = int(np.ceil(len(cell_ids) / batch_size)) + slices = [(i * batch_size, i * batch_size + batch_size) for i in range(n_batches - 1)] + slices.append(((n_batches - 1) * batch_size, len(cell_ids))) batched_args = [(cell_ids[start:end], centers[start:end]) for start, end in slices] - f = func_partial(self._get_coords_multi, - width - ) + f = func_partial(self._get_coords_multi, width) - if threads == 1: # if only one thread is used, the function is called directly to avoid the overhead of multiprocessing + if ( + threads == 1 + ): # if only one thread is used, the function is called directly to avoid the overhead of multiprocessing results = [f(arg) for arg in batched_args] else: - with mp.get_context(self.context).Pool(processes=threads) as pool: - results = list(tqdm( + with 
mp.get_context(self.context).Pool(processes=threads) as pool: + results = list( + tqdm( pool.imap(f, batched_args), total=len(batched_args), desc="Processing cell batches", @@ -168,7 +175,7 @@ def _get_coords(self, pool.join() results = flatten(results) - return(dict(results)) + return dict(results) def _get_cell_ids(self, cell_sets: list[dict]) -> list[int]: cell_ids = [] @@ -177,13 +184,15 @@ def _get_cell_ids(self, cell_sets: list[dict]) -> list[int]: cell_ids.extend(cell_set["classes"]) else: Warning(f"Cell set {cell_set['name']} does not contain any classes.") - return(cell_ids) + return cell_ids def _get_centers(self, cell_ids: list[int]) -> list[tuple[int, int]]: _sdata = self.project.filehandler.get_sdata() centers = _sdata["centers_cells"].compute() centers = centers.loc[cell_ids, :] - return(centers[["y", "x"]].values.tolist()) #needs to be returned as yx to match the coordinate system as saved in spatialdataobjects + return centers[ + ["y", "x"] + ].values.tolist() # needs to be returned as yx to match the coordinate system as saved in spatialdataobjects def _post_processing_cleanup(self, vars_to_delete: list | None = None): if vars_to_delete is not None: @@ -307,11 +316,10 @@ def process( start_time = timeit.default_timer() cell_ids = self._get_cell_ids(cell_sets) centers = self._get_centers(cell_ids) - coord_index = self._get_coords(cell_ids = cell_ids, - centers = centers, - width = self.cell_radius, - batch_size = self.batch_size, - threads = self.threads) + print("Here", flush=True) + coord_index = self._get_coords( + cell_ids=cell_ids, centers=centers, width=self.cell_radius, batch_size=self.batch_size, threads=self.threads + ) self.log(f"Coordinate lookup index calculation took {timeit.default_timer() - start_time} seconds.") sl = SegmentationLoader( @@ -320,11 +328,7 @@ def process( processes=self.config["processes_cell_sets"], ) - shape_collection = sl(None, - self.cell_sets, - self.calibration_marker, - coords_lookup=coord_index) - + shape_collection = sl(None, self.cell_sets, self.calibration_marker, coords_lookup=coord_index) if self.debug: shape_collection.plot(calibration=True) diff --git a/src/scportrait/tools/stitch/_stitch.py b/src/scportrait/tools/stitch/_stitch.py index 90ead29f..01938a0f 100644 --- a/src/scportrait/tools/stitch/_stitch.py +++ b/src/scportrait/tools/stitch/_stitch.py @@ -23,6 +23,7 @@ from scportrait.tools.stitch._utils.ashlar_plotting import plot_edge_quality, plot_edge_scatter from scportrait.tools.stitch._utils.filewriters import write_ome_zarr, write_spatialdata, write_tif, write_xml + class Stitcher: """ Class for stitching of image tiles to assemble a mosaic. 
@@ -161,7 +162,6 @@ def _lazy_imports(self): BioformatsReaderRescale, FilePatternReaderRescale, ) - from scportrait.tools.stitch._utils.parallelized_ashlar import ParallelEdgeAligner, ParallelMosaic self.ashlar_thumbnail = thumbnail @@ -655,7 +655,7 @@ def _initialize_aligner(self): return aligner def _initialize_mosaic(self): - mosaic =self.ParallelMosaic( + mosaic = self.ParallelMosaic( self.aligner, self.aligner.mosaic_shape, verbose=True, channels=self.channels, n_threads=self.threads ) return mosaic From d1d56dd0ae864411d1e523e5129d605102db31c7 Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Sophia=20M=C3=A4dler?= <15019107+sophiamaedler@users.noreply.github.com> Date: Sat, 25 Jan 2025 21:00:10 +0100 Subject: [PATCH 34/56] fix ruff issues --- src/scportrait/pipeline/_utils/helper.py | 2 +- src/scportrait/pipeline/selection.py | 6 +++--- src/scportrait/tools/stitch/_stitch.py | 9 ++++----- 3 files changed, 8 insertions(+), 9 deletions(-) diff --git a/src/scportrait/pipeline/_utils/helper.py b/src/scportrait/pipeline/_utils/helper.py index 9b11fa2a..9b2b4fed 100644 --- a/src/scportrait/pipeline/_utils/helper.py +++ b/src/scportrait/pipeline/_utils/helper.py @@ -3,7 +3,7 @@ T = TypeVar("T") -def flatten(nested_list: list[list[T]]) -> list[T]: +def flatten(nested_list: list[list[T]]) -> list[T | tuple[T]]: """Flatten a list of lists into a single list. Args: diff --git a/src/scportrait/pipeline/selection.py b/src/scportrait/pipeline/selection.py index a3c10746..87b10e81 100644 --- a/src/scportrait/pipeline/selection.py +++ b/src/scportrait/pipeline/selection.py @@ -148,7 +148,7 @@ def _get_coords_multi(self, width: int, arg: tuple[list[int], np.ndarray]) -> li def _get_coords( self, cell_ids: list, centers: list[tuple[int, int]], width: int = 60, batch_size: int = 100, threads: int = 10 - ) -> dict: + ) -> dict[int, np.ndarray]: # create batches n_batches = int(np.ceil(len(cell_ids) / batch_size)) slices = [(i * batch_size, i * batch_size + batch_size) for i in range(n_batches - 1)] @@ -174,8 +174,8 @@ def _get_coords( pool.close() pool.join() - results = flatten(results) - return dict(results) + results = flatten(results) # type: ignore + return dict(results) # type: ignore def _get_cell_ids(self, cell_sets: list[dict]) -> list[int]: cell_ids = [] diff --git a/src/scportrait/tools/stitch/_stitch.py b/src/scportrait/tools/stitch/_stitch.py index 01938a0f..00efc14c 100644 --- a/src/scportrait/tools/stitch/_stitch.py +++ b/src/scportrait/tools/stitch/_stitch.py @@ -23,7 +23,6 @@ from scportrait.tools.stitch._utils.ashlar_plotting import plot_edge_quality, plot_edge_scatter from scportrait.tools.stitch._utils.filewriters import write_ome_zarr, write_spatialdata, write_tif, write_xml - class Stitcher: """ Class for stitching of image tiles to assemble a mosaic. 
@@ -107,10 +106,6 @@ def __init__( """ self._lazy_imports() - # workaround for lazy imports of module - if self.reader_type == "FilePatternReaderRescale": - self.reader_type = self.FilePatternReaderRescale - if orientation is None: orientation = {"flip_x": False, "flip_y": True} @@ -139,6 +134,10 @@ def __init__( self.orientation = orientation self.reader_type = reader_type + # workaround for lazy imports of module + if self.reader_type == "FilePatternReaderRescale": + self.reader_type = self.FilePatternReaderRescale + # workflow setup self.plot_QC = plot_QC self.overwrite = overwrite From 57a572b204d6ca6f8eaaac786ab8a461e760958f Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Sophia=20M=C3=A4dler?= <15019107+sophiamaedler@users.noreply.github.com> Date: Sat, 25 Jan 2025 21:00:27 +0100 Subject: [PATCH 35/56] ruff linting --- src/scportrait/tools/stitch/_stitch.py | 1 + 1 file changed, 1 insertion(+) diff --git a/src/scportrait/tools/stitch/_stitch.py b/src/scportrait/tools/stitch/_stitch.py index 00efc14c..e1020d64 100644 --- a/src/scportrait/tools/stitch/_stitch.py +++ b/src/scportrait/tools/stitch/_stitch.py @@ -23,6 +23,7 @@ from scportrait.tools.stitch._utils.ashlar_plotting import plot_edge_quality, plot_edge_scatter from scportrait.tools.stitch._utils.filewriters import write_ome_zarr, write_spatialdata, write_tif, write_xml + class Stitcher: """ Class for stitching of image tiles to assemble a mosaic. From e569264046f43a820a732b8532bbddedb69ff866 Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Sophia=20M=C3=A4dler?= <15019107+sophiamaedler@users.noreply.github.com> Date: Sun, 26 Jan 2025 22:53:49 +0100 Subject: [PATCH 36/56] add helper function to read config files --- src/scportrait/pipeline/_utils/helper.py | 8 ++++++++ 1 file changed, 8 insertions(+) diff --git a/src/scportrait/pipeline/_utils/helper.py b/src/scportrait/pipeline/_utils/helper.py index 9b2b4fed..1f6c0123 100644 --- a/src/scportrait/pipeline/_utils/helper.py +++ b/src/scportrait/pipeline/_utils/helper.py @@ -1,7 +1,15 @@ from typing import TypeVar +import yaml T = TypeVar("T") +def read_config(config_path: str) -> dict: + with open(config_path) as stream: + try: + config = yaml.safe_load(stream) + except yaml.YAMLError as exc: + print(exc) + return config def flatten(nested_list: list[list[T]]) -> list[T | tuple[T]]: """Flatten a list of lists into a single list. From f4c73bf892e97ce92cb98ddc5f1a333a6f294153 Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Sophia=20M=C3=A4dler?= <15019107+sophiamaedler@users.noreply.github.com> Date: Sun, 26 Jan 2025 22:54:08 +0100 Subject: [PATCH 37/56] if config is passed as a string automatically read --- src/scportrait/pipeline/_base.py | 7 ++++++- 1 file changed, 6 insertions(+), 1 deletion(-) diff --git a/src/scportrait/pipeline/_base.py b/src/scportrait/pipeline/_base.py index fdca9b64..12ff9ae0 100644 --- a/src/scportrait/pipeline/_base.py +++ b/src/scportrait/pipeline/_base.py @@ -9,6 +9,7 @@ import numpy as np import torch +from scportrait.pipeline._utils.helper import read_config class Logable: """Create log entries. 
@@ -183,7 +184,11 @@ def __init__( self.debug = debug self.overwrite = overwrite self.project_location = project_location - self.config = config + + if isinstance(config, str): + self.config = read_config(config) + else: + self.config = config self.overwrite = overwrite self.project = project From 9873f6f62dae3895c771d7143735253c6f797d88 Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Sophia=20M=C3=A4dler?= <15019107+sophiamaedler@users.noreply.github.com> Date: Sun, 26 Jan 2025 22:54:26 +0100 Subject: [PATCH 38/56] utilize new read_config function --- src/scportrait/pipeline/project.py | 10 +++------- 1 file changed, 3 insertions(+), 7 deletions(-) diff --git a/src/scportrait/pipeline/project.py b/src/scportrait/pipeline/project.py index caa76f2f..ebd44192 100644 --- a/src/scportrait/pipeline/project.py +++ b/src/scportrait/pipeline/project.py @@ -23,7 +23,6 @@ import numpy as np import psutil import xarray -import yaml from alphabase.io import tempmmap from napari_spatialdata import Interactive from ome_zarr.io import parse_url @@ -33,6 +32,7 @@ from scportrait.io import daskmmap from scportrait.pipeline._base import Logable +from scportrait.pipeline._utils.helper import read_config from scportrait.pipeline._utils.sdata_io import sdata_filehandler from scportrait.pipeline._utils.spatialdata_helper import ( calculate_centroids, @@ -94,7 +94,7 @@ class Project(Logable): def __init__( self, project_location: str, - config_path: str, + config_path: str = None, segmentation_f=None, extraction_f=None, featurization_f=None, @@ -185,11 +185,7 @@ def _load_config_from_file(self, file_path): if not os.path.isfile(file_path): raise ValueError(f"Your config path {file_path} is invalid.") - with open(file_path) as stream: - try: - self.config = yaml.safe_load(stream) - except yaml.YAMLError as exc: - print(exc) + self.config = read_config(file_path) def _get_config_file(self, config_path: str | None = None) -> None: """Load the config file for the project. If no config file is passed the default config file in the project directory is loaded. 
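Note on patches 36-38: read_config is a thin wrapper around yaml.safe_load, and ProcessingStep now calls it automatically whenever its config argument is a string instead of a pre-parsed dict. A minimal usage sketch under those patches; the file name config.yml is a placeholder and the step class at the end is hypothetical:

    # hedged sketch of the new config handling; "config.yml" is a placeholder path
    from scportrait.pipeline._utils.helper import read_config

    config = read_config("config.yml")  # YAML file parsed into a dict via yaml.safe_load
    assert isinstance(config, dict)

    # equivalently, a ProcessingStep subclass can now be handed the path directly,
    # since _base.py reads string configs itself (hypothetical subclass name):
    # step = MySegmentationStep(config="config.yml", directory="out")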
From 7e04994faab886388d7ddf4d3b5985a5cdf7a8ef Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Sophia=20M=C3=A4dler?= <15019107+sophiamaedler@users.noreply.github.com> Date: Sun, 26 Jan 2025 22:54:48 +0100 Subject: [PATCH 39/56] turn off overwriting for classification directory --- src/scportrait/pipeline/project.py | 4 +++- 1 file changed, 3 insertions(+), 1 deletion(-) diff --git a/src/scportrait/pipeline/project.py b/src/scportrait/pipeline/project.py index ebd44192..6f597aa3 100644 --- a/src/scportrait/pipeline/project.py +++ b/src/scportrait/pipeline/project.py @@ -305,7 +305,7 @@ def _setup_featurization_f(self, featurization_f): self.featurization_directory, project_location=self.project_location, debug=self.debug, - overwrite=self.overwrite, + overwrite=False, #this needs to be set to false as the featurization step should not remove previously created features project=self, filehandler=self.filehandler, ) @@ -1063,6 +1063,8 @@ def featurize( # setup overwrite if specified in call if overwrite is not None: self.featurization_f.overwrite_run_path = overwrite + if overwrite is None: + self.featurization_f.overwrite_run_path = True # update the number of masks that are available in the segmentation object self.featurization_f.n_masks = sum([self.nuc_seg_status, self.cyto_seg_status]) From 4b4def2a716ec0ec1e2bfcf98be8710f3cd701e5 Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Sophia=20M=C3=A4dler?= <15019107+sophiamaedler@users.noreply.github.com> Date: Sun, 26 Jan 2025 22:59:10 +0100 Subject: [PATCH 40/56] fix reading of config if method is a key in config ensure that only the config parameters relevant to that method are read --- src/scportrait/pipeline/_base.py | 6 +++++- 1 file changed, 5 insertions(+), 1 deletion(-) diff --git a/src/scportrait/pipeline/_base.py b/src/scportrait/pipeline/_base.py index 12ff9ae0..6a67c180 100644 --- a/src/scportrait/pipeline/_base.py +++ b/src/scportrait/pipeline/_base.py @@ -186,7 +186,11 @@ def __init__( self.project_location = project_location if isinstance(config, str): - self.config = read_config(config) + config = read_config(config) + if self.__class__.__name__ in config.keys(): + self.config = config[self.__class__.__name__ ] + else: + self.config = config else: self.config = config self.overwrite = overwrite From db03a2370351b8e494b31f8c4a6c5a6f5bc96365 Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Sophia=20M=C3=A4dler?= <15019107+sophiamaedler@users.noreply.github.com> Date: Sun, 26 Jan 2025 23:03:23 +0100 Subject: [PATCH 41/56] make datatype optional for directory creation --- src/scportrait/pipeline/featurization.py | 6 +++++- 1 file changed, 5 insertions(+), 1 deletion(-) diff --git a/src/scportrait/pipeline/featurization.py b/src/scportrait/pipeline/featurization.py index 316b6aaf..39d45dd9 100644 --- a/src/scportrait/pipeline/featurization.py +++ b/src/scportrait/pipeline/featurization.py @@ -36,6 +36,7 @@ def __init__(self, *args, **kwargs): self.model = None self.transforms = None self.expected_imagesize = None + self.data_type = None self._setup_channel_selection() @@ -59,7 +60,10 @@ def _setup_output(self): if not os.path.isdir(self.directory): os.makedirs(self.directory) - self.run_path = os.path.join(self.directory, f"{self.data_type}_{self.label}") + if self.data_type is None: + self.run_path = os.path.join(self.directory, self.label) + else: + self.run_path = os.path.join(self.directory, f"{self.data_type}_{self.label}") if not os.path.isdir(self.run_path): os.makedirs(self.run_path) From e72c3366ed6c842a769b7f671edbcae77c9c7a94 Mon Sep 17 
00:00:00 2001 From: =?UTF-8?q?Sophia=20M=C3=A4dler?= <15019107+sophiamaedler@users.noreply.github.com> Date: Mon, 27 Jan 2025 01:51:26 +0100 Subject: [PATCH 42/56] add from_project parameter start implementing support for running methods outside of the scportrait project structure --- src/scportrait/pipeline/_base.py | 24 +++++++++++++++++++----- src/scportrait/pipeline/project.py | 4 ++++ 2 files changed, 23 insertions(+), 5 deletions(-) diff --git a/src/scportrait/pipeline/_base.py b/src/scportrait/pipeline/_base.py index 6a67c180..fe07217e 100644 --- a/src/scportrait/pipeline/_base.py +++ b/src/scportrait/pipeline/_base.py @@ -177,13 +177,30 @@ class ProcessingStep(Logable): DEFAULT_SELECTION_DIR_NAME = "selection" def __init__( - self, config, directory, project_location, debug=False, overwrite=False, project=None, filehandler=None + self, + config, + directory = None, + project_location = None, + debug=False, + overwrite=False, + project=None, + filehandler=None, + from_project:bool = False, ): super().__init__(directory=directory) self.debug = debug self.overwrite = overwrite - self.project_location = project_location + if from_project: + self.project_run = True + self.project_location = project_location + self.project = project + self.filehandler = filehandler + else: + self.project_run = False + self.project_location = None + self.project = None + self.filehandler = None if isinstance(config, str): config = read_config(config) @@ -195,9 +212,6 @@ def __init__( self.config = config self.overwrite = overwrite - self.project = project - self.filehandler = filehandler - self.get_context() self.deep_debug = False diff --git a/src/scportrait/pipeline/project.py b/src/scportrait/pipeline/project.py index 6f597aa3..138344d9 100644 --- a/src/scportrait/pipeline/project.py +++ b/src/scportrait/pipeline/project.py @@ -253,6 +253,7 @@ def _setup_segmentation_f(self, segmentation_f): overwrite=self.overwrite, project=None, filehandler=self.filehandler, + from_project=True, ) def _setup_extraction_f(self, extraction_f): @@ -281,6 +282,7 @@ def _setup_extraction_f(self, extraction_f): overwrite=self.overwrite, project=self, filehandler=self.filehandler, + from_project=True, ) def _setup_featurization_f(self, featurization_f): @@ -308,6 +310,7 @@ def _setup_featurization_f(self, featurization_f): overwrite=False, #this needs to be set to false as the featurization step should not remove previously created features project=self, filehandler=self.filehandler, + from_project=True, ) def _setup_selection(self, selection_f): @@ -335,6 +338,7 @@ def _setup_selection(self, selection_f): overwrite=self.overwrite, project=self, filehandler=self.filehandler, + from_project=True, ) def update_featurization_f(self, featurization_f): From 07098c40e941d95fb1202577793726b2a4d6eecd Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Sophia=20M=C3=A4dler?= <15019107+sophiamaedler@users.noreply.github.com> Date: Mon, 27 Jan 2025 01:51:39 +0100 Subject: [PATCH 43/56] save n_masks to single-cell dataset --- src/scportrait/pipeline/extraction.py | 6 ++++++ 1 file changed, 6 insertions(+) diff --git a/src/scportrait/pipeline/extraction.py b/src/scportrait/pipeline/extraction.py index 505f6081..20bfbb75 100644 --- a/src/scportrait/pipeline/extraction.py +++ b/src/scportrait/pipeline/extraction.py @@ -732,6 +732,12 @@ def _transfer_tempmmap_to_hdf5(self): dtype=h5py.special_dtype(vlen=str), ) + hf.create_dataset( + "n_masks", + data=self.n_masks, + dtype=int, + ) + self.log("channel information created.") # cleanup memory From 
cde2998ff46cdfa3100486c8065d5dbf0d086a2d Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Sophia=20M=C3=A4dler?= <15019107+sophiamaedler@users.noreply.github.com> Date: Mon, 27 Jan 2025 01:51:53 +0100 Subject: [PATCH 44/56] read masks from hdf5 if not already provided --- src/scportrait/pipeline/featurization.py | 129 +++++++++++++++-------- 1 file changed, 85 insertions(+), 44 deletions(-) diff --git a/src/scportrait/pipeline/featurization.py b/src/scportrait/pipeline/featurization.py index 39d45dd9..32089592 100644 --- a/src/scportrait/pipeline/featurization.py +++ b/src/scportrait/pipeline/featurization.py @@ -6,6 +6,7 @@ from functools import partial as func_partial import numpy as np +import h5py import pandas as pd import pytorch_lightning as pl import torch @@ -17,7 +18,6 @@ from scportrait.tools.ml.datasets import HDF5SingleCellDataset from scportrait.tools.ml.plmodels import MultilabelSupervisedModel - class _FeaturizationBase(ProcessingStep): PRETRAINED_MODEL_NAMES = [ "autophagy_classifier", @@ -170,10 +170,22 @@ def _setup_inference_device(self): self.inference_device = self._detect_automatic_inference_device() self.log(f"Automatically configured inferece device to {self.inference_device}") - def _general_setup(self): + def _get_nmasks(self): + if "n_masks" not in self.__dict__.keys(): + try: + self.n_masks = h5py.File(self.extraction_file, "r")["n_masks"][()].item() + except Exception as e: + raise ValueError( + f"Could not extract number of masks from HDF5 file. Error: {e}" + ) from e + + def _general_setup(self, extraction_dir: str, return_results: bool = False): """Helper function to execute all setup functions that are common to all featurization steps.""" - self._setup_output() + self.extraction_file = extraction_dir + if not return_results: + self._setup_output() + self._get_nmasks() self._setup_log_transform() self._setup_inference_device() @@ -784,8 +796,8 @@ def _setup_transforms(self) -> None: return - def _setup(self): - self._general_setup() + def _setup(self, extraction_dir: str, return_results: bool): + self._general_setup(extraction_dir=extraction_dir, return_results=return_results) self._get_model_specs() self._get_network_dir() @@ -803,7 +815,7 @@ def _setup(self): self._setup_encoders() self._setup_transforms() - def process(self, extraction_dir: str, size: int = 0): + def process(self, extraction_dir: str, size: int = 0, return_results: bool = False): """ Perform classification on the provided HDF5 dataset. @@ -880,7 +892,7 @@ class based on the previous single-cell extraction. Therefore, only the second a self.log("Started MLClusterClassifier classification.") # perform setup - self._setup() + self._setup(extraction_dir = extraction_dir, return_results=return_results) self.dataloader = self.generate_dataloader( extraction_dir, @@ -890,21 +902,28 @@ class based on the previous single-cell extraction. 
Therefore, only the second a ) # perform inference + all_results = [] for model in self.models: self.log(f"Starting inference for model encoder {model.__name__}") results = self.inference(self.dataloader, model) - output_name = f"inference_{model.__name__}" - path = os.path.join(self.run_path, f"{output_name}.csv") + if not return_results: + output_name = f"inference_{model.__name__}" + path = os.path.join(self.run_path, f"{output_name}.csv") - self._write_results_csv(results, path) - self._write_results_sdata(results, label=f"{self.label}_{model.__name__}") - - self.log(f"Results saved to file: {path}") + self._write_results_csv(results, path) + self._write_results_sdata(results, label=f"{self.label}_{model.__name__}") + else: + all_results.append(results) - # perform post processing cleanup - if not self.deep_debug: - self._post_processing_cleanup() + if return_results: + self._clear_cache() + return all_results + else: + self.log(f"Results saved to file: {path}") + # perform post processing cleanup + if not self.deep_debug: + self._post_processing_cleanup() class EnsembleClassifier(_FeaturizationBase): @@ -956,8 +975,8 @@ def _load_models(self): memory_usage = self._get_gpu_memory_usage() self.log(f"GPU memory usage after loading models: {memory_usage}") - def _setup(self): - self._general_setup() + def _setup(self, extraction_dir: str): + self._general_set(extraction_dir=extraction_dir) self._get_model_specs() self._setup_transforms() @@ -969,7 +988,7 @@ def _setup(self): self._load_models() - def process(self, extraction_dir, size=0): + def process(self, extraction_dir:str, size:int = 0, return_results:bool = False): """ Function called to perform classification on the provided HDF5 dataset. @@ -1024,7 +1043,7 @@ class based on the previous single-cell extraction. Therefore, no parameters nee self.log("Starting Ensemble Classification") - self._setup() + self._setup(extraction_dir=extraction_dir, return_results=return_results) self.dataloader = self.generate_dataloader( extraction_dir, @@ -1034,19 +1053,28 @@ class based on the previous single-cell extraction. 
Therefore, no parameters nee ) # perform inference + all_results = {} for model_name, model in zip(self.model_names, self.model, strict=False): self.log(f"Starting inference for model {model_name}") results = self.inference(self.dataloader, model) output_name = f"ensemble_inference_{model_name}" - path = os.path.join(self.run_path, f"{output_name}.csv") - self._write_results_csv(results, path) - self._write_results_sdata(results, label=model_name) + if not return_results: + path = os.path.join(self.run_path, f"{output_name}.csv") - # perform post processing cleanup - if not self.deep_debug: - self._post_processing_cleanup() + self._write_results_csv(results, path) + self._write_results_sdata(results, label=model_name) + else: + all_results[model_name] = results + + if return_results: + self._clear_cache() + return all_results + else: + # perform post processing cleanup + if not self.deep_debug: + self._post_processing_cleanup() ####### CellFeaturization based on Classic Featurecalculation ####### @@ -1083,10 +1111,19 @@ def _setup_transforms(self): return def _get_channel_specs(self): - if "channel_names" in self.project.__dict__.keys(): - self.channel_names = self.project.channel_names + if self.project is None: + try: + with h5py.File(self.extraction_file, "r") as f: + self.channel_names = list(f["channel_information"][:].astype(str)) + except Exception as e: + raise ValueError( + f"Could not extract channel names from HDF5 file. Please provide channel names manually. Error: {e}" + ) from e else: - self.channel_names = self.project.input_image.c.values + if "channel_names" in self.project.__dict__.keys(): + self.channel_names = self.project.channel_names + else: + self.channel_names = self.project.input_image.c.values def _generate_column_names( self, @@ -1298,12 +1335,12 @@ def __init__(self, *args, **kwargs): self.channel_selection = None # ensure that all images are passed to the function - def _setup(self): - self._general_setup() + def _setup(self, extraction_dir:str, return_results:bool): + self._general_setup(extraction_dir=extraction_dir, return_results=return_results) self._setup_transforms() self._get_channel_specs() - def process(self, extraction_dir, size=0): + def process(self, extraction_dir: str, size: int =0, return_results: bool = False): """ Perform featurization on the provided HDF5 dataset. 
@@ -1358,7 +1395,7 @@ def process(self, extraction_dir, size=0): self.log("Started CellFeaturization of all available channels.") # perform setup - self._setup() + self._setup(extraction_dir=extraction_dir, return_results=return_results) self.dataloader = self.generate_dataloader( extraction_dir, @@ -1388,15 +1425,19 @@ def process(self, extraction_dir, size=0): column_names=self.column_names, ) - output_name = "calculated_image_features" - path = os.path.join(self.run_path, f"{output_name}.csv") + if return_results: + self._clear_cache() + return results + else: + output_name = "calculated_image_features" + path = os.path.join(self.run_path, f"{output_name}.csv") - self._write_results_csv(results, path) - self._write_results_sdata(results) + self._write_results_csv(results, path) + self._write_results_sdata(results) - # perform post processing cleanup - if not self.deep_debug: - self._post_processing_cleanup() + # perform post processing cleanup + if not self.deep_debug: + self._post_processing_cleanup() class CellFeaturizer_single_channel(_cellFeaturizerBase): @@ -1412,17 +1453,17 @@ def _setup_channel_selection(self): self.channel_selection = [0, self.channel_selection] return - def _setup(self): - self._general_setup() + def _setup(self, extraction_dir:str, return_results:bool): + self._general_setup(extraction_dir=extraction_dir, return_results=return_results) self._setup_channel_selection() self._setup_transforms() self._get_channel_specs() - def process(self, extraction_dir, size=0): + def process(self, extraction_dir, size=0, return_results: bool = False): self.log(f"Started CellFeaturization of selected channel {self.channel_selection}.") # perform setup - self._setup() + self._setup(extraction_dir=extraction_dir, return_results=return_results) self.dataloader = self.generate_dataloader( extraction_dir, From 72f0c26e906d62193c3512e367a43af4c83162e7 Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Sophia=20M=C3=A4dler?= <15019107+sophiamaedler@users.noreply.github.com> Date: Mon, 27 Jan 2025 01:54:11 +0100 Subject: [PATCH 45/56] fix ruff issues + linting --- src/scportrait/pipeline/_base.py | 9 +++++---- src/scportrait/pipeline/_utils/helper.py | 3 +++ src/scportrait/pipeline/featurization.py | 21 ++++++++++----------- src/scportrait/pipeline/project.py | 2 +- 4 files changed, 19 insertions(+), 16 deletions(-) diff --git a/src/scportrait/pipeline/_base.py b/src/scportrait/pipeline/_base.py index fe07217e..d241cdb1 100644 --- a/src/scportrait/pipeline/_base.py +++ b/src/scportrait/pipeline/_base.py @@ -11,6 +11,7 @@ from scportrait.pipeline._utils.helper import read_config + class Logable: """Create log entries. 
@@ -179,13 +180,13 @@ class ProcessingStep(Logable): def __init__( self, config, - directory = None, - project_location = None, + directory=None, + project_location=None, debug=False, overwrite=False, project=None, filehandler=None, - from_project:bool = False, + from_project: bool = False, ): super().__init__(directory=directory) @@ -205,7 +206,7 @@ def __init__( if isinstance(config, str): config = read_config(config) if self.__class__.__name__ in config.keys(): - self.config = config[self.__class__.__name__ ] + self.config = config[self.__class__.__name__] else: self.config = config else: diff --git a/src/scportrait/pipeline/_utils/helper.py b/src/scportrait/pipeline/_utils/helper.py index 1f6c0123..e9301048 100644 --- a/src/scportrait/pipeline/_utils/helper.py +++ b/src/scportrait/pipeline/_utils/helper.py @@ -1,8 +1,10 @@ from typing import TypeVar + import yaml T = TypeVar("T") + def read_config(config_path: str) -> dict: with open(config_path) as stream: try: @@ -11,6 +13,7 @@ def read_config(config_path: str) -> dict: print(exc) return config + def flatten(nested_list: list[list[T]]) -> list[T | tuple[T]]: """Flatten a list of lists into a single list. diff --git a/src/scportrait/pipeline/featurization.py b/src/scportrait/pipeline/featurization.py index 32089592..28124a79 100644 --- a/src/scportrait/pipeline/featurization.py +++ b/src/scportrait/pipeline/featurization.py @@ -5,8 +5,8 @@ from contextlib import redirect_stdout from functools import partial as func_partial -import numpy as np import h5py +import numpy as np import pandas as pd import pytorch_lightning as pl import torch @@ -18,6 +18,7 @@ from scportrait.tools.ml.datasets import HDF5SingleCellDataset from scportrait.tools.ml.plmodels import MultilabelSupervisedModel + class _FeaturizationBase(ProcessingStep): PRETRAINED_MODEL_NAMES = [ "autophagy_classifier", @@ -175,9 +176,7 @@ def _get_nmasks(self): try: self.n_masks = h5py.File(self.extraction_file, "r")["n_masks"][()].item() except Exception as e: - raise ValueError( - f"Could not extract number of masks from HDF5 file. Error: {e}" - ) from e + raise ValueError(f"Could not extract number of masks from HDF5 file. Error: {e}") from e def _general_setup(self, extraction_dir: str, return_results: bool = False): """Helper function to execute all setup functions that are common to all featurization steps.""" @@ -892,7 +891,7 @@ class based on the previous single-cell extraction. Therefore, only the second a self.log("Started MLClusterClassifier classification.") # perform setup - self._setup(extraction_dir = extraction_dir, return_results=return_results) + self._setup(extraction_dir=extraction_dir, return_results=return_results) self.dataloader = self.generate_dataloader( extraction_dir, @@ -975,8 +974,8 @@ def _load_models(self): memory_usage = self._get_gpu_memory_usage() self.log(f"GPU memory usage after loading models: {memory_usage}") - def _setup(self, extraction_dir: str): - self._general_set(extraction_dir=extraction_dir) + def _setup(self, extraction_dir: str, return_results: bool): + self._general_setup(extraction_dir=extraction_dir, return_results=return_results) self._get_model_specs() self._setup_transforms() @@ -988,7 +987,7 @@ def _setup(self, extraction_dir: str): self._load_models() - def process(self, extraction_dir:str, size:int = 0, return_results:bool = False): + def process(self, extraction_dir: str, size: int = 0, return_results: bool = False): """ Function called to perform classification on the provided HDF5 dataset. 
@@ -1335,12 +1334,12 @@ def __init__(self, *args, **kwargs): self.channel_selection = None # ensure that all images are passed to the function - def _setup(self, extraction_dir:str, return_results:bool): + def _setup(self, extraction_dir: str, return_results: bool): self._general_setup(extraction_dir=extraction_dir, return_results=return_results) self._setup_transforms() self._get_channel_specs() - def process(self, extraction_dir: str, size: int =0, return_results: bool = False): + def process(self, extraction_dir: str, size: int = 0, return_results: bool = False): """ Perform featurization on the provided HDF5 dataset. @@ -1453,7 +1452,7 @@ def _setup_channel_selection(self): self.channel_selection = [0, self.channel_selection] return - def _setup(self, extraction_dir:str, return_results:bool): + def _setup(self, extraction_dir: str, return_results: bool): self._general_setup(extraction_dir=extraction_dir, return_results=return_results) self._setup_channel_selection() self._setup_transforms() diff --git a/src/scportrait/pipeline/project.py b/src/scportrait/pipeline/project.py index 138344d9..90f2bc38 100644 --- a/src/scportrait/pipeline/project.py +++ b/src/scportrait/pipeline/project.py @@ -307,7 +307,7 @@ def _setup_featurization_f(self, featurization_f): self.featurization_directory, project_location=self.project_location, debug=self.debug, - overwrite=False, #this needs to be set to false as the featurization step should not remove previously created features + overwrite=False, # this needs to be set to false as the featurization step should not remove previously created features project=self, filehandler=self.filehandler, from_project=True, From 4d10cfb1ee4b58ead38a3b62afefb9d12e4fa7a0 Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Sophia=20M=C3=A4dler?= <15019107+sophiamaedler@users.noreply.github.com> Date: Mon, 27 Jan 2025 11:15:41 +0100 Subject: [PATCH 46/56] add support for passing multiple HDF5 paths to featurizers --- src/scportrait/pipeline/featurization.py | 82 ++++++++++++++++++------ 1 file changed, 63 insertions(+), 19 deletions(-) diff --git a/src/scportrait/pipeline/featurization.py b/src/scportrait/pipeline/featurization.py index 28124a79..4303a2ba 100644 --- a/src/scportrait/pipeline/featurization.py +++ b/src/scportrait/pipeline/featurization.py @@ -109,6 +109,23 @@ def _detect_automatic_inference_device(self): return inference_device + def _get_nmasks(self): + if "n_masks" not in self.__dict__.keys(): + if isinstance(self.extraction_file, str): + with h5py.File(self.extraction_file, "r") as f: + self.n_masks = f["n_masks"][()].item() + if isinstance(self.extraction_file, list): + n_masks = [] + for file in self.extraction_file: + with h5py.File(file, "r") as f: + n_masks.append(f["n_masks"][()].item()) + assert (x == n_masks[0] for x in n_masks), "number of masks are not consistent over all passed HDF5 files." + self.n_masks = n_masks[0] + try: + self.n_masks = h5py.File(self.extraction_file, "r")["n_masks"][()].item() + except Exception as e: + raise ValueError(f"Could not extract number of masks from HDF5 file. Error: {e}") from e + def _setup_inference_device(self): """ Configure the featurization run to use the specified inference device. 
@@ -171,13 +188,6 @@ def _setup_inference_device(self): self.inference_device = self._detect_automatic_inference_device() self.log(f"Automatically configured inferece device to {self.inference_device}") - def _get_nmasks(self): - if "n_masks" not in self.__dict__.keys(): - try: - self.n_masks = h5py.File(self.extraction_file, "r")["n_masks"][()].item() - except Exception as e: - raise ValueError(f"Could not extract number of masks from HDF5 file. Error: {e}") from e - def _general_setup(self, extraction_dir: str, return_results: bool = False): """Helper function to execute all setup functions that are common to all featurization steps.""" @@ -406,7 +416,8 @@ def configure_transforms(self, selected_transforms: list): def generate_dataloader( self, - extraction_dir: str, + extraction_dir: str | list[str], + labels: int | list[int] = 0, selected_transforms: transforms.Compose = transforms.Compose([]), size: int = 0, seed: int | None = 42, @@ -443,11 +454,20 @@ def generate_dataloader( self.log(f"Expected image size is set to {self.expected_imagesize}. Resizing images to this size.") t = transforms.Compose([t, transforms.Resize(self.expected_imagesize)]) + if isinstance(extraction_dir, list): + assert isinstance(labels, list), "If multiple directories are provided, multiple labels must be provided." + paths = extraction_dir + labels = labels + elif isinstance(extraction_dir, str): + assert isinstance(labels, int), "If only one directory is provided, only one label must be provided." + paths = [extraction_dir] + labels = [labels] + f = io.StringIO() with redirect_stdout(f): dataset = dataset_class( - dir_list=[extraction_dir], - dir_labels=[0], + dir_list=paths, + dir_labels=labels, transform=t, return_id=True, select_channel=self.channel_selection, @@ -814,7 +834,11 @@ def _setup(self, extraction_dir: str, return_results: bool): self._setup_encoders() self._setup_transforms() - def process(self, extraction_dir: str, size: int = 0, return_results: bool = False): + def process(self, + extraction_dir: str, + labels: int | list[int] = 0, + size: int = 0, + return_results: bool = False): """ Perform classification on the provided HDF5 dataset. @@ -895,6 +919,7 @@ class based on the previous single-cell extraction. Therefore, only the second a self.dataloader = self.generate_dataloader( extraction_dir, + labels = labels, selected_transforms=self.transforms, size=size, dataset_class=self.DEFAULT_DATA_LOADER, @@ -987,7 +1012,11 @@ def _setup(self, extraction_dir: str, return_results: bool): self._load_models() - def process(self, extraction_dir: str, size: int = 0, return_results: bool = False): + def process(self, + extraction_dir: str, + labels: int | list[int] = 0, + size: int = 0, + return_results: bool = False): """ Function called to perform classification on the provided HDF5 dataset. @@ -1046,6 +1075,7 @@ class based on the previous single-cell extraction. Therefore, no parameters nee self.dataloader = self.generate_dataloader( extraction_dir, + labels = labels, selected_transforms=self.transforms, size=size, dataset_class=self.DEFAULT_DATA_LOADER, @@ -1111,13 +1141,16 @@ def _setup_transforms(self): def _get_channel_specs(self): if self.project is None: - try: + if isinstance(self.extraction_file, str): with h5py.File(self.extraction_file, "r") as f: self.channel_names = list(f["channel_information"][:].astype(str)) - except Exception as e: - raise ValueError( - f"Could not extract channel names from HDF5 file. Please provide channel names manually. 
Error: {e}" - ) from e + if isinstance(self.extraction_file, list): + channel_names = [] + for file in self.extraction_file: + with h5py.File(file, "r") as f: + channel_names.append(list(f["channel_information"][:].astype(str))) + assert (x == channel_names[0] for x in channel_names), "Channel names are not consistent over all passed HDF5 files." + self.channel_names = channel_names[0] else: if "channel_names" in self.project.__dict__.keys(): self.channel_names = self.project.channel_names @@ -1339,7 +1372,11 @@ def _setup(self, extraction_dir: str, return_results: bool): self._setup_transforms() self._get_channel_specs() - def process(self, extraction_dir: str, size: int = 0, return_results: bool = False): + def process(self, + extraction_dir: str | list[str], + labels: int | list[int] = 0, + size: int = 0, + return_results: bool = False): """ Perform featurization on the provided HDF5 dataset. @@ -1398,6 +1435,7 @@ def process(self, extraction_dir: str, size: int = 0, return_results: bool = Fal self.dataloader = self.generate_dataloader( extraction_dir, + labels = labels, selected_transforms=self.transforms, size=size, dataset_class=self.DEFAULT_DATA_LOADER, @@ -1458,7 +1496,12 @@ def _setup(self, extraction_dir: str, return_results: bool): self._setup_transforms() self._get_channel_specs() - def process(self, extraction_dir, size=0, return_results: bool = False): + def process(self, + extraction_dir: str | list[str], + labels: int | list[int] = 0, + size=0, + return_results: bool = False): + self.log(f"Started CellFeaturization of selected channel {self.channel_selection}.") # perform setup @@ -1466,6 +1509,7 @@ def process(self, extraction_dir, size=0, return_results: bool = False): self.dataloader = self.generate_dataloader( extraction_dir, + labels = labels, selected_transforms=self.transforms, size=size, dataset_class=self.DEFAULT_DATA_LOADER, From d51dc9460b9910086d696a4826a2a6a35d803154 Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Sophia=20M=C3=A4dler?= <15019107+sophiamaedler@users.noreply.github.com> Date: Mon, 27 Jan 2025 14:58:40 +0100 Subject: [PATCH 47/56] fix incorrect definition of from_project parameter in a project run --- src/scportrait/pipeline/segmentation/segmentation.py | 2 ++ 1 file changed, 2 insertions(+) diff --git a/src/scportrait/pipeline/segmentation/segmentation.py b/src/scportrait/pipeline/segmentation/segmentation.py index 63d62490..82e1e8f7 100644 --- a/src/scportrait/pipeline/segmentation/segmentation.py +++ b/src/scportrait/pipeline/segmentation/segmentation.py @@ -85,6 +85,7 @@ def __init__( overwrite, project, filehandler, + from_project: bool = False, **kwargs, ): super().__init__( @@ -95,6 +96,7 @@ def __init__( overwrite=overwrite, project=project, filehandler=filehandler, + from_project=from_project, ) if self.directory is not None: From 94d4f77ad4bdfbd1bfada00bfb4e9bba37551551 Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Sophia=20M=C3=A4dler?= <15019107+sophiamaedler@users.noreply.github.com> Date: Mon, 27 Jan 2025 15:01:41 +0100 Subject: [PATCH 48/56] ruff linting + fix typing issues --- src/scportrait/pipeline/featurization.py | 51 ++++++++++-------------- 1 file changed, 21 insertions(+), 30 deletions(-) diff --git a/src/scportrait/pipeline/featurization.py b/src/scportrait/pipeline/featurization.py index 4303a2ba..9a0cbee5 100644 --- a/src/scportrait/pipeline/featurization.py +++ b/src/scportrait/pipeline/featurization.py @@ -119,7 +119,9 @@ def _get_nmasks(self): for file in self.extraction_file: with h5py.File(file, "r") as f: 
n_masks.append(f["n_masks"][()].item()) - assert (x == n_masks[0] for x in n_masks), "number of masks are not consistent over all passed HDF5 files." + assert ( + x == n_masks[0] for x in n_masks + ), "number of masks are not consistent over all passed HDF5 files." self.n_masks = n_masks[0] try: self.n_masks = h5py.File(self.extraction_file, "r")["n_masks"][()].item() @@ -188,7 +190,7 @@ def _setup_inference_device(self): self.inference_device = self._detect_automatic_inference_device() self.log(f"Automatically configured inferece device to {self.inference_device}") - def _general_setup(self, extraction_dir: str, return_results: bool = False): + def _general_setup(self, extraction_dir: str | list[str], return_results: bool = False): """Helper function to execute all setup functions that are common to all featurization steps.""" self.extraction_file = extraction_dir @@ -834,11 +836,7 @@ def _setup(self, extraction_dir: str, return_results: bool): self._setup_encoders() self._setup_transforms() - def process(self, - extraction_dir: str, - labels: int | list[int] = 0, - size: int = 0, - return_results: bool = False): + def process(self, extraction_dir: str, labels: int | list[int] = 0, size: int = 0, return_results: bool = False): """ Perform classification on the provided HDF5 dataset. @@ -919,7 +917,7 @@ class based on the previous single-cell extraction. Therefore, only the second a self.dataloader = self.generate_dataloader( extraction_dir, - labels = labels, + labels=labels, selected_transforms=self.transforms, size=size, dataset_class=self.DEFAULT_DATA_LOADER, @@ -1012,11 +1010,7 @@ def _setup(self, extraction_dir: str, return_results: bool): self._load_models() - def process(self, - extraction_dir: str, - labels: int | list[int] = 0, - size: int = 0, - return_results: bool = False): + def process(self, extraction_dir: str, labels: int | list[int] = 0, size: int = 0, return_results: bool = False): """ Function called to perform classification on the provided HDF5 dataset. @@ -1075,7 +1069,7 @@ class based on the previous single-cell extraction. Therefore, no parameters nee self.dataloader = self.generate_dataloader( extraction_dir, - labels = labels, + labels=labels, selected_transforms=self.transforms, size=size, dataset_class=self.DEFAULT_DATA_LOADER, @@ -1149,7 +1143,9 @@ def _get_channel_specs(self): for file in self.extraction_file: with h5py.File(file, "r") as f: channel_names.append(list(f["channel_information"][:].astype(str))) - assert (x == channel_names[0] for x in channel_names), "Channel names are not consistent over all passed HDF5 files." + assert ( + x == channel_names[0] for x in channel_names + ), "Channel names are not consistent over all passed HDF5 files." 
                self.channel_names = channel_names[0]
        else:
            if "channel_names" in self.project.__dict__.keys():
@@ -1367,16 +1363,14 @@ def __init__(self, *args, **kwargs):
            self.channel_selection = None  # ensure that all images are passed to the function

-    def _setup(self, extraction_dir: str, return_results: bool):
+    def _setup(self, extraction_dir: str | list[str], return_results: bool):
        self._general_setup(extraction_dir=extraction_dir, return_results=return_results)
        self._setup_transforms()
        self._get_channel_specs()

-    def process(self,
-                extraction_dir: str | list[str],
-                labels: int | list[int] = 0,
-                size: int = 0,
-                return_results: bool = False):
+    def process(
+        self, extraction_dir: str | list[str], labels: int | list[int] = 0, size: int = 0, return_results: bool = False
+    ):
        """
        Perform featurization on the provided HDF5 dataset.

@@ -1435,7 +1429,7 @@ def process(

        self.dataloader = self.generate_dataloader(
            extraction_dir,
-            labels = labels,
+            labels=labels,
            selected_transforms=self.transforms,
            size=size,
            dataset_class=self.DEFAULT_DATA_LOADER,
@@ -1490,18 +1484,15 @@ def _setup_channel_selection(self):
            self.channel_selection = [0, self.channel_selection]
        return

-    def _setup(self, extraction_dir: str, return_results: bool):
+    def _setup(self, extraction_dir: str | list[str], return_results: bool):
        self._general_setup(extraction_dir=extraction_dir, return_results=return_results)
        self._setup_channel_selection()
        self._setup_transforms()
        self._get_channel_specs()

-    def process(self,
-                extraction_dir: str | list[str],
-                labels: int | list[int] = 0,
-                size=0,
-                return_results: bool = False):
-
+    def process(
+        self, extraction_dir: str | list[str], labels: int | list[int] = 0, size=0, return_results: bool = False
+    ):
        self.log(f"Started CellFeaturization of selected channel {self.channel_selection}.")

        # perform setup
@@ -1509,7 +1500,7 @@ def process(self,

        self.dataloader = self.generate_dataloader(
            extraction_dir,
-            labels = labels,
+            labels=labels,
            selected_transforms=self.transforms,
            size=size,
            dataset_class=self.DEFAULT_DATA_LOADER,
From 06835c5643eea892f56610c531ef21dcf8da99d7 Mon Sep 17 00:00:00 2001
From: =?UTF-8?q?Sophia=20M=C3=A4dler?= <15019107+sophiamaedler@users.noreply.github.com>
Date: Mon, 27 Jan 2025 15:05:31 +0100
Subject: [PATCH 49/56] fix: remove unnecessary print statement
---
 src/scportrait/pipeline/selection.py | 1 -
 1 file changed, 1 deletion(-)

diff --git a/src/scportrait/pipeline/selection.py b/src/scportrait/pipeline/selection.py
index 87b10e81..aa53235c 100644
--- a/src/scportrait/pipeline/selection.py
+++ b/src/scportrait/pipeline/selection.py
@@ -316,7 +316,6 @@ def process(
        start_time = timeit.default_timer()
        cell_ids = self._get_cell_ids(cell_sets)
        centers = self._get_centers(cell_ids)
-        print("Here", flush=True)
        coord_index = self._get_coords(
            cell_ids=cell_ids, centers=centers, width=self.cell_radius, batch_size=self.batch_size, threads=self.threads
        )
From 8547d0fc777a8659279acbda3a81eef62b2a2a5b Mon Sep 17 00:00:00 2001
From: =?UTF-8?q?Sophia=20M=C3=A4dler?= <15019107+sophiamaedler@users.noreply.github.com>
Date: Mon, 27 Jan 2025 15:19:14 +0100
Subject: [PATCH 50/56] silence dask warning

Addresses #139.
---
 src/scportrait/__init__.py | 6 ++++++
 1 file changed, 6 insertions(+)

diff --git a/src/scportrait/__init__.py b/src/scportrait/__init__.py
index 972a0f85..159650a1 100644
--- a/src/scportrait/__init__.py
+++ b/src/scportrait/__init__.py
@@ -1,7 +1,13 @@
 """Top-level package for scPortrait"""

+# silence warnings
+import warnings
+
 from scportrait import io
 from
scportrait import pipeline as pipeline from scportrait import plotting as pl from scportrait import processing as pp from scportrait import tools as tl + +# silence warning from spatialdata resulting in an older dask version see #139 +warnings.filterwarnings("ignore", message="ignoring keyword argument 'read_only'") From a4373e8a8a04cfb912e4f1b2aee685299f4d871d Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Sophia=20M=C3=A4dler?= <15019107+sophiamaedler@users.noreply.github.com> Date: Mon, 27 Jan 2025 15:22:02 +0100 Subject: [PATCH 51/56] silence cellpose warning addresses #141 --- src/scportrait/__init__.py | 5 +++++ 1 file changed, 5 insertions(+) diff --git a/src/scportrait/__init__.py b/src/scportrait/__init__.py index 159650a1..772f8753 100644 --- a/src/scportrait/__init__.py +++ b/src/scportrait/__init__.py @@ -11,3 +11,8 @@ # silence warning from spatialdata resulting in an older dask version see #139 warnings.filterwarnings("ignore", message="ignoring keyword argument 'read_only'") + +# silence warning from cellpose resulting in missing parameter set in model call see #141 +warnings.filterwarnings( + "ignore", message=r"You are using `torch.load` with `weights_only=False`.*", category=FutureWarning +) From 46b2fc1e415d8b60a8c24c87c13a8dda14c5ec51 Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Sophia=20M=C3=A4dler?= <15019107+sophiamaedler@users.noreply.github.com> Date: Mon, 27 Jan 2025 17:03:12 +0100 Subject: [PATCH 52/56] update example notebooks --- examples/notebooks | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/examples/notebooks b/examples/notebooks index 5a9b127f..c37c8473 160000 --- a/examples/notebooks +++ b/examples/notebooks @@ -1 +1 @@ -Subproject commit 5a9b127f06a39d326931728a0cf9850848fca205 +Subproject commit c37c8473d5a61923185d4a24d76d87c697037cb2 From c2073d1e2f16dc87a337ab5827e87da077d9aa34 Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Sophia=20M=C3=A4dler?= <15019107+sophiamaedler@users.noreply.github.com> Date: Mon, 27 Jan 2025 18:05:47 +0100 Subject: [PATCH 53/56] fix incorrect logic in check for segmentation masks --- src/scportrait/pipeline/project.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/src/scportrait/pipeline/project.py b/src/scportrait/pipeline/project.py index 90f2bc38..c219ee3b 100644 --- a/src/scportrait/pipeline/project.py +++ b/src/scportrait/pipeline/project.py @@ -1093,7 +1093,7 @@ def select( self._check_sdata_status() - if not self.nuc_seg_status or not self.cyto_seg_status: + if not self.nuc_seg_status and not self.cyto_seg_status: raise ValueError("No nucleus or cytosol segmentation loaded. Please load a segmentation first.") assert self.sdata is not None, "No sdata object loaded." 
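Note on PATCH 53: with the previous `or`, the guard in select() raised as soon as either the nucleus or the cytosol segmentation was missing, although selection only needs one of the two masks; the corrected `and` raises only when both are absent. A self-contained truth-table sketch, with plain booleans standing in for self.nuc_seg_status and self.cyto_seg_status:

    # truth table for the guard fixed in PATCH 53; True means the mask is loaded
    for nuc, cyto in [(True, True), (True, False), (False, True), (False, False)]:
        old = not nuc or not cyto   # raised in three of the four cases
        new = not nuc and not cyto  # raises only when both masks are missing
        print(f"nuc={nuc}, cyto={cyto} -> old raises {old}, new raises {new}")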
From e059a315221195481441659cee3cff7bb8c570dd Mon Sep 17 00:00:00 2001
From: =?UTF-8?q?Sophia=20M=C3=A4dler?= <15019107+sophiamaedler@users.noreply.github.com>
Date: Mon, 27 Jan 2025 18:14:50 +0100
Subject: [PATCH 54/56] update notebook submodule commit number
---
 examples/notebooks | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

diff --git a/examples/notebooks b/examples/notebooks
index c37c8473..904a93f3 160000
--- a/examples/notebooks
+++ b/examples/notebooks
@@ -1 +1 @@
-Subproject commit c37c8473d5a61923185d4a24d76d87c697037cb2
+Subproject commit 904a93f389dcb2d5b6ee0c172c48dcc173ca127d
From 864d25dac39b5c9112fd768656d3bf3fb18c252f Mon Sep 17 00:00:00 2001
From: =?UTF-8?q?Sophia=20M=C3=A4dler?= <15019107+sophiamaedler@users.noreply.github.com>
Date: Fri, 31 Jan 2025 12:57:49 +0100
Subject: [PATCH 55/56] update py-lmd version to 1.3.1

This fixes a bug in py-lmd; in addition, py-lmd 1.3.1 supports numpy<=2.1.
---
 requirements.txt     | 2 +-
 requirements_dev.txt | 2 +-
 2 files changed, 2 insertions(+), 2 deletions(-)

diff --git a/requirements.txt b/requirements.txt
index 1a80991b..59b101aa 100755
--- a/requirements.txt
+++ b/requirements.txt
@@ -35,4 +35,4 @@ pyqt5
 lxml_html_clean
 ashlar>=1.19.0
 networkx
-py-lmd>=1.3.0
+py-lmd>=1.3.1
diff --git a/requirements_dev.txt b/requirements_dev.txt
index 57e26bb3..9ff75f16 100644
--- a/requirements_dev.txt
+++ b/requirements_dev.txt
@@ -35,7 +35,7 @@ pyqt5
 lxml_html_clean
 ashlar>=1.19.0
 networkx
-py-lmd>=1.3.0
+py-lmd>=1.3.1

#packages for building the documentation
sphinx
From 2070779697823abc92cd4f0987c77906f3a1d4be Mon Sep 17 00:00:00 2001
From: =?UTF-8?q?Sophia=20M=C3=A4dler?= <15019107+sophiamaedler@users.noreply.github.com>
Date: Fri, 31 Jan 2025 13:13:31 +0100
Subject: [PATCH 56/56] update submodule commit number
---
 examples/notebooks | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

diff --git a/examples/notebooks b/examples/notebooks
index 904a93f3..d5ea844b 160000
--- a/examples/notebooks
+++ b/examples/notebooks
@@ -1 +1 @@
-Subproject commit 904a93f389dcb2d5b6ee0c172c48dcc173ca127d
+Subproject commit d5ea844b033e18d5fc3c82213c8bfde93465d47f
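Closing note: the series ends by tightening the py-lmd pin to >=1.3.1 in both requirements files. A quick, hedged way to check that a local environment already satisfies the new pin; this assumes the distribution is installed under the name py-lmd and that the packaging library is available:

    # environment check for the py-lmd pin; importlib.metadata is standard library,
    # packaging must be installed separately
    from importlib.metadata import version
    from packaging.version import Version

    installed = Version(version("py-lmd"))
    assert installed >= Version("1.3.1"), f"py-lmd {installed} is older than the 1.3.1 pin"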