From 45bdd6502d6814e5f1c63cd09ebf4805f7ab2061 Mon Sep 17 00:00:00 2001
From: Jermiah Joseph <jermiahjoseph98@gmail.com>
Date: Tue, 24 Dec 2024 11:35:53 -0500
Subject: [PATCH 1/6] refactor: enhance image utility functions with type
 annotations and improved documentation

---
 src/imgtools/utils/imageutils.py | 139 +++++++++++++++++++++++--------
 1 file changed, 102 insertions(+), 37 deletions(-)

diff --git a/src/imgtools/utils/imageutils.py b/src/imgtools/utils/imageutils.py
index 33483f5a..844f6b88 100644
--- a/src/imgtools/utils/imageutils.py
+++ b/src/imgtools/utils/imageutils.py
@@ -1,52 +1,117 @@
-import SimpleITK as sitk
+from typing import List, Tuple
+
 import numpy as np
+import SimpleITK as sitk
+
+# Define type aliases for better readability
+Array3D = Tuple[float, float, float]
+ImageArrayMetadata = Tuple[np.ndarray, Array3D, Array3D, Array3D]
+
+
+def image_to_array(image: sitk.Image) -> ImageArrayMetadata:
+	"""
+	Converts a SimpleITK image to a numpy array along with its metadata.
+
+	Parameters
+	----------
+	image : sitk.Image
+		The SimpleITK image to convert.
 
+	Returns
+	-------
+	ImageArrayMetadata
+		A tuple containing:
+		- The image as a numpy array.
+		- The origin of the image (tuple of floats).
+		- The direction cosines of the image (tuple of floats).
+		- The pixel spacing of the image (tuple of floats).
+	"""
+	origin: Array3D = image.GetOrigin()
+	direction: Array3D = image.GetDirection()
+	spacing: Array3D = image.GetSpacing()
+	array: np.ndarray = sitk.GetArrayFromImage(image)
+	return array, origin, direction, spacing
 
-def physical_points_to_idxs(image, points, continuous=False):
-    if continuous:
-        transform = image.TransformPhysicalPointToContinuousIndex
-    else:
-        transform = image.TransformPhysicalPointToIndex
-    
-    vectorized_transform = np.vectorize(lambda x: np.array(transform(x)), signature='(3)->(3)')
-    
-    # transform indices to ContourSequence/ContourData-wise
-    t_points = []
-    for slc in points:
-        t_points.append(vectorized_transform(slc)[:,::-1])
-    return t_points
 
+def physical_points_to_idxs(
+	image: sitk.Image, points: List[np.ndarray], continuous: bool = False
+) -> List[np.ndarray]:
+	"""
+	Converts physical points to image indices based on the reference image's geometry.
 
-def idxs_to_physical_points(image, idxs):
-    continuous = any([isinstance(i, float) for i in idxs])
+	This function uses the geometry of a SimpleITK image (origin, spacing, direction) to convert
+	real-world physical coordinates into indices in the image grid. It optionally supports continuous
+	indices for sub-pixel precision.
 
-    if continuous:
-        transform = image.TransformContinuousIndexToPhysicalPoint
-    else:
-        transform = image.TransformIndexToPhysicalPoint
-    vectorized_transform = np.vectorize(lambda x: np.array(transform(x)), signature='(3)->(3)')
-    return vectorized_transform(idxs)
+	Parameters
+	----------
+	image : sitk.Image
+		The reference SimpleITK image.
+	points : List[np.ndarray]
+		List of 3D physical points to transform.
+	continuous : bool, optional
+		If True, returns continuous indices; otherwise, returns integer indices. Default is False.
 
+	Returns
+	-------
+	List[np.ndarray]
+		A list of transformed points in image index space, reversed to match library conventions.
 
-def image_to_array(image):
-    origin, direction, spacing = image.GetOrigin(), image.GetDirection(), image.GetSpacing()
-    array = sitk.GetArrayFromImage(image)
-    return array, origin, direction, spacing
+	Notes
+	-----
+	The following steps occur within the function:
+	1. A `numpy.vectorize` function is defined to apply the transformation method (physical to index)
+	   to each 3D point in the input array.
+	2. The transformation is applied to each set of points in the list, reversing the coordinate
+	   order to match the library's indexing convention.
+	"""
+	# Select the appropriate transformation function based on the `continuous` parameter.
+	transform = (
+		image.TransformPhysicalPointToContinuousIndex
+		if continuous
+		else image.TransformPhysicalPointToIndex
+	)
 
+	# Step 1: Define a vectorized transformation function
+	# The lambda function takes a single 3D point `x` and:
+	# - Applies the selected transformation (`transform(x)`) to convert it from physical space to index space.
+	# - Wraps the result into a numpy array for further processing.
+	# `np.vectorize` creates a vectorized function that can process arrays of points in one call.
+	# The `signature="(3)->(3)"` ensures the transformation operates on 3D points, returning 3D results.
+	vectorized_transform = np.vectorize(lambda x: np.array(transform(x)), signature='(3)->(3)')
 
-def show_image(image, mask=None, ax=None):
-    import matplotlib.pyplot as plt
-    if ax is None:
-        ax = plt.subplots()
+	# Step 2: Apply the vectorized transformation to all slices of points.
+	# For each 2D array `slc` in the `points` list:
+	# - `vectorized_transform(slc)` applies the transformation to all points in `slc`.
+	# - `[:, ::-1]` reverses the coordinate order (from (x, y, z) to (z, y, x)) to match the library's convention.
+	# The result is stored as a list of numpy arrays (`t_points`), each corresponding to a transformed slice.
+	t_points: List[np.ndarray] = [vectorized_transform(slc)[:, ::-1] for slc in points]
 
-    image_array, *_ = image_to_array(image)
+	# Return the list of transformed points.
+	return t_points
 
-    ax.imshow(image_array, cmap="bone", interpolation="bilinear")
 
-    if mask is not None:
-        mask_array, *_ = image_to_array(mask)
-        mask_array = np.ma.masked_where(mask_array == 0, mask_array)
+def idxs_to_physical_points(image: sitk.Image, idxs: np.ndarray) -> np.ndarray:
+	"""
+	Converts image indices to physical points based on the reference image's geometry.
 
-        ax.imshow(mask_array, cmap="tab20")
+	Parameters
+	----------
+	image : sitk.Image
+		The reference SimpleITK image.
+	idxs : np.ndarray
+		Array of 3D indices (continuous or discrete).
 
-    return ax
+	Returns
+	-------
+	np.ndarray
+		Physical coordinates corresponding to the given indices.
+	"""
+	continuous = np.issubdtype(idxs.dtype, np.floating)
+	transform = (
+		image.TransformContinuousIndexToPhysicalPoint
+		if continuous
+		else image.TransformIndexToPhysicalPoint
+	)
+	vectorized_transform = np.vectorize(lambda x: np.array(transform(x)), signature='(3)->(3)')
+	return vectorized_transform(idxs)

From 4c7ee296f2f1c3ff59e307ae38a57470793ba7d4 Mon Sep 17 00:00:00 2001
From: Jermiah Joseph <jermiahjoseph98@gmail.com>
Date: Tue, 24 Dec 2024 12:38:00 -0500
Subject: [PATCH 2/6] refactor: update mypy and ruff configurations to include
 additional module paths

---
 config/mypy.ini  |  3 ++-
 config/ruff.toml | 11 ++++-------
 2 files changed, 6 insertions(+), 8 deletions(-)

diff --git a/config/mypy.ini b/config/mypy.ini
index 64614b8c..207d1413 100644
--- a/config/mypy.ini
+++ b/config/mypy.ini
@@ -3,7 +3,8 @@
 files = 
     src/imgtools/logging/**/*.py,
     src/imgtools/dicom/**/*.py,
-    src/imgtools/cli/**/*.py
+    src/imgtools/cli/**/*.py,
+    src/imgtools/modules/**/*.py,
 
 # Exclude files from analysis
 exclude = tests,
diff --git a/config/ruff.toml b/config/ruff.toml
index 930ca89d..917c2424 100644
--- a/config/ruff.toml
+++ b/config/ruff.toml
@@ -4,8 +4,8 @@
 # slowly fix everything
 
 include = [
-  "src/imgtools/logging/**/*.py", 
-  # "src/imgtools/cli/**/*.py",
+  "src/imgtools/logging/**/*.py",
+  "src/imgtools/modules/segmentation.py",
   "src/imgtools/dicom/**/*.py",
   # "src/imgtools/utils/crawl.py",
 ]
@@ -15,16 +15,13 @@ extend-exclude = [
   "tests/**/*.py",
   "src/imgtools/ops/ops.py",
   "src/imgtools/io/**/*.py",
-  "src/imgtools/modules/**/*.py",
   "src/imgtools/transforms/**/*.py",
   "src/imgtools/autopipeline.py",
   "src/imgtools/pipeline.py",
   "src/imgtools/image.py",
 ]
 
-extend-include = [
-  "src/imgtools/ops/functional.py",
-]
+extend-include = ["src/imgtools/ops/functional.py"]
 
 
 line-length = 100
@@ -105,7 +102,7 @@ ignore = [
   # Ignored because https://docs.astral.sh/ruff/formatter/#conflicting-lint-rules 
   "COM812", # https://docs.astral.sh/ruff/rules/missing-trailing-comma/#missing-trailing-comma-com812
   "D206",
-  "N813", 
+  "N813",
 ]
 [lint.pydocstyle]
 convention = "numpy"

From 2a8af1e5df5d20215b2dcdfe10c22ef0a14d8a07 Mon Sep 17 00:00:00 2001
From: Jermiah Joseph <jermiahjoseph98@gmail.com>
Date: Tue, 24 Dec 2024 12:38:21 -0500
Subject: [PATCH 3/6] refactor: improve logic and documentation to be more
 clear

---
 src/imgtools/modules/structureset.py | 605 ++++++++++++++++++---------
 1 file changed, 399 insertions(+), 206 deletions(-)

diff --git a/src/imgtools/modules/structureset.py b/src/imgtools/modules/structureset.py
index 6b73b723..4fcf5dd9 100644
--- a/src/imgtools/modules/structureset.py
+++ b/src/imgtools/modules/structureset.py
@@ -1,7 +1,36 @@
-import re
-from typing import Dict, List, Optional, TypeVar
+"""
+Module for handling and converting DICOM RTSTRUCT contour data to segmentations.
+
+This module provides classes and methods for processing DICOM RTSTRUCT files,
+which store contour data for regions of interest (ROIs). The main class,
+`StructureSet`, facilitates the extraction, manipulation, and conversion of
+contour data into 3D masks or segmentations compatible with other imaging
+pipelines.
+
+Classes
+-------
+StructureSet
+	Represents a DICOM RTSTRUCT file, allowing operations such as loading
+	ROI contours, converting physical points to masks, and exporting to
+	segmentation objects.
+
+Functions
+---------
+StructureSet._get_roi_points(rtstruct, roi_index)
+	Static method that extracts and reshapes contour points for a specific
+	ROI in an RTSTRUCT file.
 
+Notes
+-----
+The `StructureSet` class provides utility methods for handling complex ROI
+labeling schemes, such as those based on regular expressions, and supports
+multiple output formats for segmentation masks. It also integrates robust
+error handling and logging to handle malformed or incomplete DICOM files.
+"""
+import re
+from typing import Dict, List, Optional, TypeVar, Union
 import numpy as np
+from pydicom.dataset import FileDataset
 import SimpleITK as sitk
 from pydicom import dcmread
 from itertools import groupby
@@ -14,209 +43,373 @@
 T = TypeVar('T')
 
 
-def _get_roi_points(rtstruct, roi_index):
-    return [np.array(slc.ContourData).reshape(-1, 3) for slc in rtstruct.ROIContourSequence[roi_index].ContourSequence]
+class StructureSet:
+	def __init__(self, roi_points: Dict[str, List[np.ndarray]], metadata: Optional[Dict[str, T]] = None) -> None:
+		"""Initialize the StructureSet class containing contour points.
 
+		Parameters
+		----------
+		roi_points : Dict[str, List[np.ndarray]]
+			A dictionary mapping ROI (Region of Interest) names to a list of 2D arrays.
+			Each array contains the 3D physical coordinates of the contour points for a slice.
+		metadata : Optional[Dict[str, T]], optional
+			A dictionary containing additional metadata from the DICOM RTSTRUCT file.
+			Default is None, in which case an empty dictionary is used.
 
-class StructureSet:
-    def __init__(self, roi_points: Dict[str, np.ndarray], metadata: Optional[Dict[str, T]] = None):
-        """Initializes the StructureSet class containing contour points
-        
-        Parameters
-        ----------
-        roi_points
-            Dictionary of {"ROI": [ndarray of shape n x 3 of contour points]}
-        
-        metadata
-            Dictionary of DICOM metadata
-        """
-        self.roi_points = roi_points
-        if metadata:
-            self.metadata = metadata
-        else:
-            self.metadata = {}
-
-    @classmethod
-    def from_dicom_rtstruct(cls, rtstruct_path: str, suppress_warnings: bool = False) -> 'StructureSet':
-        rtstruct = dcmread(rtstruct_path, force=True)
-        roi_names = [roi.ROIName for roi in rtstruct.StructureSetROISequence]
-        roi_points = {}
-        for i, name in enumerate(roi_names):
-            try:
-                roi_points[name] = _get_roi_points(rtstruct, i)
-            except AttributeError as ae:
-                if not suppress_warnings:
-                    logger.warning(f"Could not get points for ROI `{name}`.", rtstruct_path=rtstruct_path, error=ae)
-
-        metadata = {}
-        
-        return cls(roi_points, metadata)
-
-    @property
-    def roi_names(self) -> List[str]:
-        return list(self.roi_points.keys())
-
-    def _assign_labels(self, 
-                       names, 
-                       roi_select_first: bool = False,
-                       roi_separate: bool = False):
-        """
-        Parameters
-        ----
-        roi_select_first
-            Select the first matching ROI/regex for each OAR, no duplicate matches. 
-
-        roi_separate
-            Process each matching ROI/regex as individual masks, instead of consolidating into one mask
-            Each mask will be named ROI_n, where n is the nth regex/name/string.
-        """
-        labels = {}
-        cur_label = 0
-        if names == self.roi_names:
-            for i, name in enumerate(self.roi_names):
-                labels[name] = i
-        else:
-            for _, pattern in enumerate(names):
-                if sorted(names) == sorted(list(labels.keys())):  # checks if all ROIs have already been processed.
-                    break
-                if isinstance(pattern, str):
-                    for i, name in enumerate(self.roi_names):
-                        if re.fullmatch(pattern, name, flags=re.IGNORECASE):
-                            labels[name] = cur_label
-                            cur_label += 1
-                else:  # if multiple regex/names to match
-                    matched = False
-                    for subpattern in pattern:
-                        if roi_select_first and matched:  # break if roi_select_first and we're matched
-                            break
-                        for n, name in enumerate(self.roi_names):
-                            if re.fullmatch(subpattern, name, flags=re.IGNORECASE):
-                                matched = True
-                                if not roi_separate:
-                                    labels[name] = cur_label
-                                else:
-                                    labels[f"{name}_{n}"] = cur_label
-                                
-                    cur_label += 1
-        return labels
-
-    def get_mask(self, reference_image, mask, label, idx, continuous):
-        size = reference_image.GetSize()[::-1]
-        physical_points = self.roi_points.get(label, np.array([]))
-        mask_points = physical_points_to_idxs(reference_image, physical_points, continuous=continuous)
-        for contour in mask_points:
-            try:
-                z, slice_points = np.unique(contour[:, 0]), contour[:, 1:]
-                if len(z) == 1:  # assert len(z) == 1, f"This contour ({name}) spreads across more than 1 slice."
-                    slice_mask = polygon2mask(size[1:], slice_points)
-                    mask[z[0], :, :, idx] += slice_mask
-            except:  # rounding errors for points on the boundary
-                if z == mask.shape[0]:
-                    z -= 1
-                elif z == -1: #?
-                    z += 1
-                elif z > mask.shape[0] or z < -1:
-                    raise IndexError(f"{z} index is out of bounds for image sized {mask.shape}.")
-                
-                # if the contour spans only 1 z-slice 
-                if len(z) == 1:
-                    z_idx = int(np.floor(z[0]))
-                    slice_mask = polygon2mask(size[1:], slice_points)
-                    mask[z_idx, :, :, idx] += slice_mask
-                else:
-                    raise ValueError("This contour is corrupted and spans across 2 or more slices.")
-
-    def to_segmentation(self, reference_image: sitk.Image,
-                        roi_names: Dict[str, str] = None,
-                        continuous: bool = True,
-                        existing_roi_indices: Dict[str, int] = None,
-                        ignore_missing_regex: bool = False,
-                        roi_select_first: bool = False,
-                        roi_separate: bool = False) -> Segmentation:
-        """Convert the structure set to a Segmentation object.
-
-        Parameters
-        ----------
-        reference_image
-            Image used as reference geometry.
-        roi_names
-            List of ROI names to export. Both full names and
-            case-insensitive regular expressions are allowed.
-            All labels within one sublist will be assigned
-            the same label.
-
-        Returns
-        -------
-        Segmentation
-            The segmentation object.
-
-        Notes
-        -----
-        If `roi_names` contains lists of strings, each matching
-        name within a sublist will be assigned the same label. This means
-        that `roi_names=['pat']` and `roi_names=[['pat']]` can lead
-        to different label assignments, depending on how many ROI names
-        match the pattern. E.g. if `self.roi_names = ['fooa', 'foob']`,
-        passing `roi_names=['foo(a|b)']` will result in a segmentation with 
-        two labels, but passing `roi_names=[['foo(a|b)']]` will result in
-        one label for both `'fooa'` and `'foob'`.
-
-        In general, the exact ordering of the returned labels cannot be
-        guaranteed (unless all patterns in `roi_names` can only match
-        a single name or are lists of strings).
-        """
-        labels = {}
-        if roi_names is None or roi_names == {}:
-            roi_names = self.roi_names  # all the contour names
-            labels = self._assign_labels(roi_names, roi_select_first, roi_separate)  # only the ones that match the regex
-        elif isinstance(roi_names, dict):
-            for name, pattern in roi_names.items():
-                if isinstance(pattern, str):
-                    matching_names = list(self._assign_labels([pattern], roi_select_first).keys())
-                    if matching_names:
-                        labels[name] = matching_names  # {"GTV": ["GTV1", "GTV2"]} is the result of _assign_labels()
-                elif isinstance(pattern, list):  # for inputs that have multiple patterns for the input, e.g. {"GTV": ["GTV.*", "HTVI.*"]}
-                    labels[name] = []
-                    for pattern_one in pattern:
-                        matching_names = list(self._assign_labels([pattern_one], roi_select_first).keys())
-                        if matching_names:
-                            labels[name].extend(matching_names)  # {"GTV": ["GTV1", "GTV2"]}
-        if isinstance(roi_names, str):
-            roi_names = [roi_names]
-        if isinstance(roi_names, list):  # won't this always trigger after the previous?
-            labels = self._assign_labels(roi_names, roi_select_first)
-        logger.debug(f"Found {len(labels)} labels", labels=labels)
-        all_empty = True
-        for v in labels.values():
-            if v != []:
-                all_empty = False
-        if all_empty:
-            if not ignore_missing_regex:
-                raise ValueError(f"No ROIs matching {roi_names} found in {self.roi_names}.")
-            else:
-                return None
-        labels = {k:v for (k,v) in labels.items() if v != [] }
-        size = reference_image.GetSize()[::-1] + (len(labels),)
-        mask = np.zeros(size, dtype=np.uint8)
-
-        seg_roi_indices = {}
-        if roi_names != {} and isinstance(roi_names, dict):
-            for i, (name, label_list) in enumerate(labels.items()):
-                for label in label_list:
-                    self.get_mask(reference_image, mask, label, i, continuous)
-                seg_roi_indices[name] = i
-
-        else:
-            for name, label in labels.items():
-                self.get_mask(reference_image, mask, name, label, continuous)
-            seg_roi_indices = {"_".join(k): v for v, k in groupby(labels, key=lambda x: labels[x])}
-
-        mask[mask > 1] = 1
-        mask = sitk.GetImageFromArray(mask, isVector=True)
-        mask.CopyInformation(reference_image)
-        mask = Segmentation(mask, roi_indices=seg_roi_indices, existing_roi_indices=existing_roi_indices, raw_roi_names=labels)  # in the segmentation, pass all the existing roi names and then process is in the segmentation class
-
-        return mask
-
-    def __repr__(self):
-        return f"<StructureSet with ROIs: {self.roi_names!r}>"
+		Examples
+		--------
+		>>> roi_points = {"GTV": [np.array([[0, 0, 0], [1, 1, 1]])]}
+		>>> metadata = {"PatientName": "John Doe"}
+		>>> structure_set = StructureSet(roi_points, metadata)
+		"""
+		self.roi_points: Dict[str, List[np.ndarray]] = roi_points
+		self.metadata: Dict[str, T] = metadata if metadata is not None else {}
+
+	@classmethod
+	def from_dicom_rtstruct(cls, rtstruct_path: str, suppress_warnings: bool = False) -> 'StructureSet':
+		"""Create a StructureSet instance from a DICOM RTSTRUCT file.
+
+		Parameters
+		----------
+		rtstruct_path : str
+			Path to the DICOM RTSTRUCT file.
+		suppress_warnings : bool, optional
+			If True, suppresses warnings for missing or invalid ROI data. Default is False.
+
+		Returns
+		-------
+		StructureSet
+			An instance of the StructureSet class containing the ROI data and metadata.
+
+		Raises
+		------
+		FileNotFoundError
+			If the specified RTSTRUCT file does not exist.
+		ValueError
+			If the RTSTRUCT file is invalid or cannot be read.
+
+		Examples
+		--------
+		>>> structure_set = StructureSet.from_dicom_rtstruct("path/to/rtstruct.dcm")
+		"""
+		# Load the RTSTRUCT file
+		rtstruct: FileDataset = dcmread(rtstruct_path, force=True)
+
+		# Extract ROI names and points
+		roi_names: List[str] = [roi.ROIName for roi in rtstruct.StructureSetROISequence]
+		roi_points: Dict[str, List[np.ndarray]] = {}
+
+		for i, name in enumerate(roi_names):
+			try:
+				roi_points[name] = cls._get_roi_points(rtstruct, i)
+			except AttributeError as ae:
+				if not suppress_warnings:
+					logger.warning(
+						f"Could not get points for ROI `{name}`.",
+						rtstruct_path=rtstruct_path,
+						error=ae,
+					)
+
+		# Initialize metadata (can be extended later to extract more useful fields)
+		metadata: Dict[str, Union[str, int, float]] = {}
+
+		# Return the StructureSet instance
+		return cls(roi_points, metadata)
+	
+	@staticmethod
+	def _get_roi_points(rtstruct: FileDataset, roi_index: int) -> List[np.ndarray]:
+		"""Extract and reshape contour points for a specific ROI in an RTSTRUCT file.
+
+		Parameters
+		----------
+		rtstruct : FileDataset
+			The loaded DICOM RTSTRUCT file.
+		roi_index : int
+			The index of the ROI in the ROIContourSequence.
+
+		Returns
+		-------
+		List[np.ndarray]
+			A list of numpy arrays where each array contains the 3D physical coordinates
+			of the contour points for a specific slice.
+
+		Raises
+		------
+		AttributeError
+			If the ROIContourSequence, ContourSequence, or ContourData is missing or malformed.
+
+		Examples
+		--------
+		>>> rtstruct = dcmread("path/to/rtstruct.dcm", force=True)
+		>>> points = StructureSet._get_roi_points(rtstruct, 0)
+		"""
+		# Check for ROIContourSequence
+		if not hasattr(rtstruct, "ROIContourSequence"):
+			raise AttributeError("The DICOM RTSTRUCT file is missing 'ROIContourSequence'.")
+		
+		# Check if ROI index exists in the sequence
+		if roi_index >= len(rtstruct.ROIContourSequence) or roi_index < 0:
+			raise AttributeError(f"ROI index {roi_index} is out of bounds for the 'ROIContourSequence'.")
+
+		roi_contour = rtstruct.ROIContourSequence[roi_index]
+
+		# Check for ContourSequence in the specified ROI
+		if not hasattr(roi_contour, "ContourSequence"):
+			raise AttributeError(f"ROI at index {roi_index} is missing 'ContourSequence'.")
+
+		contour_sequence = roi_contour.ContourSequence
+
+		# Check for ContourData in each contour
+		contour_points = []
+		for i, slc in enumerate(contour_sequence):
+			if not hasattr(slc, "ContourData"):
+				raise AttributeError(
+					f"Contour {i} in ROI at index {roi_index} is missing 'ContourData'."
+				)
+			contour_points.append(np.array(slc.ContourData).reshape(-1, 3))
+
+		return contour_points
+
+	@property
+	def roi_names(self) -> List[str]:
+		"""List of all ROI (Region of Interest) names."""
+		return list(self.roi_points.keys())
+
+
+	def _assign_labels(
+		self,
+		names: List[Union[str, List[str]]],
+		roi_select_first: bool = False,
+		roi_separate: bool = False,
+	) -> Dict[str, int]:
+		"""
+		Assigns integer labels to ROIs (Regions of Interest) based on their names or regex patterns.
+
+		This method supports flexible and configurable labeling of ROIs using exact matches or regular
+		expressions. It also allows for advanced configurations such as selecting only the first match
+		or treating each match as a separate mask.
+
+		Parameters
+		----------
+		names : List[Union[str, List[str]]]
+			A list of ROI names or regex patterns. Can be:
+				- A list of strings representing exact matches or regex patterns.
+				- A nested list of regex patterns, where all matching ROIs within the same sublist
+				are assigned the same label.
+		roi_select_first : bool, optional
+			If True, selects only the first matching ROI for each regex pattern or name.
+			Default is False.
+		roi_separate : bool, optional
+			If True, each ROI matched by a nested (list) pattern is keyed as "<name>_<index>" (its index
+			in `roi_names`) and shares the sublist's label; string patterns are unaffected. Default is False.
+
+		Returns
+		-------
+		Dict[str, int]
+			A dictionary mapping ROI names to their assigned integer labels.
+
+		Raises
+		------
+		ValueError
+			If `names` is empty or does not match any ROIs.
+
+		Examples
+		--------
+		Let's say we have the following ROI names:
+		>>> self.roi_names = ["GTV", "PTV", "CTV_0", "CTV_1"]
+
+		Case 1: Default behavior
+		All matching ROIs for each pattern are assigned the same label (number).
+		Note how the CTV ROIs are assigned the same label: 1
+		>>> self._assign_labels(["GTV", "CTV.*"])
+		{'GTV': 0, 'CTV_0': 1, 'CTV_1': 1}
+
+		Case 2: Select only the first match for each pattern
+		Subsequent matches are ignored.
+		>>> self._assign_labels(["GTV", "CTV.*"], roi_select_first=True)
+		{'GTV': 0, 'CTV_0': 1}
+
+		Case 3: `roi_separate=True` with plain string patterns
+		The option only affects nested (list) patterns, so string patterns are
+		labelled exactly as in Case 1: both CTV ROIs still share label 1.
+		>>> self._assign_labels(["GTV", "CTV.*"], roi_separate=True)
+		{'GTV': 0, 'CTV_0': 1, 'CTV_1': 1}
+
+		Case 4: Grouped patterns
+		>>> self._assign_labels([["GTV", "PTV"], "CTV.*"])
+		{'GTV': 0, 'PTV': 0, 'CTV_0': 1, 'CTV_1': 1}
+		"""
+		if not names:
+			raise ValueError("The 'names' list cannot be empty.")
+		if roi_select_first and roi_separate:
+			raise ValueError(
+				"The options 'roi_select_first' and 'roi_separate' cannot both be True. "
+				"'roi_select_first' stops after the first match,"
+				" while 'roi_separate' processes all matches individually."
+			)
+
+		labels: Dict[str, int] = {}
+		cur_label = 0
+
+		# Case 1: If `names` is exactly `self.roi_names`, assign sequential labels directly.
+		if names == self.roi_names:
+			return {name: i for i, name in enumerate(self.roi_names)}
+
+		# Case 2: Iterate over `names` (could contain regex patterns or sublists)
+		for pattern in names:
+			# Skip further processing if all names have been assigned
+			if sorted(names) == sorted(labels.keys()):
+				break
+
+			# Single pattern: string or regex
+			if isinstance(pattern, str):
+				for i, name in enumerate(self.roi_names):
+					if re.fullmatch(pattern, name, flags=re.IGNORECASE):
+						labels[name] = cur_label
+						if roi_select_first:
+							break
+				cur_label += 1
+
+			# Nested patterns: list of strings or regexes
+			elif isinstance(pattern, list):
+				matched = False
+				for subpattern in pattern:
+					if roi_select_first and matched:
+						break
+					for i, name in enumerate(self.roi_names):
+						if re.fullmatch(subpattern, name, flags=re.IGNORECASE):
+							matched = True
+							if roi_separate:
+								labels[f"{name}_{i}"] = cur_label
+							else:
+								labels[name] = cur_label
+				cur_label += 1
+
+		# Validate output
+		if not labels:
+			raise ValueError(f"No matching ROIs found for the provided patterns: {names}")
+
+		return labels
+
+	def get_mask(self, reference_image, mask, label, idx, continuous):
+		size = reference_image.GetSize()[::-1]
+		physical_points = self.roi_points.get(label, np.array([]))
+		mask_points = physical_points_to_idxs(reference_image, physical_points, continuous=continuous)
+		for contour in mask_points:
+			try:
+				z, slice_points = np.unique(contour[:, 0]), contour[:, 1:]
+				if len(z) == 1:  # assert len(z) == 1, f"This contour ({name}) spreads across more than 1 slice."
+					slice_mask = polygon2mask(size[1:], slice_points)
+					mask[z[0], :, :, idx] += slice_mask
+			except:  # rounding errors for points on the boundary
+				if z == mask.shape[0]:
+					z -= 1
+				elif z == -1: #?
+					z += 1
+				elif z > mask.shape[0] or z < -1:
+					raise IndexError(f"{z} index is out of bounds for image sized {mask.shape}.")
+				
+				# if the contour spans only 1 z-slice 
+				if len(z) == 1:
+					z_idx = int(np.floor(z[0]))
+					slice_mask = polygon2mask(size[1:], slice_points)
+					mask[z_idx, :, :, idx] += slice_mask
+				else:
+					raise ValueError("This contour is corrupted and spans across 2 or more slices.")
+
+	def to_segmentation(self, reference_image: sitk.Image,
+						roi_names: Dict[str, str] = None,
+						continuous: bool = True,
+						existing_roi_indices: Dict[str, int] = None,
+						ignore_missing_regex: bool = False,
+						roi_select_first: bool = False,
+						roi_separate: bool = False) -> Segmentation:
+		"""Convert the structure set to a Segmentation object.
+
+		Parameters
+		----------
+		reference_image
+			Image used as reference geometry.
+		roi_names
+			List of ROI names to export. Both full names and
+			case-insensitive regular expressions are allowed.
+			All labels within one sublist will be assigned
+			the same label.
+
+		Returns
+		-------
+		Segmentation
+			The segmentation object.
+
+		Notes
+		-----
+		If `roi_names` contains lists of strings, each matching
+		name within a sublist will be assigned the same label. This means
+		that `roi_names=['pat']` and `roi_names=[['pat']]` can lead
+		to different label assignments, depending on how many ROI names
+		match the pattern. E.g. if `self.roi_names = ['fooa', 'foob']`,
+		passing `roi_names=['foo(a|b)']` will result in a segmentation with 
+		two labels, but passing `roi_names=[['foo(a|b)']]` will result in
+		one label for both `'fooa'` and `'foob'`.
+
+		In general, the exact ordering of the returned labels cannot be
+		guaranteed (unless all patterns in `roi_names` can only match
+		a single name or are lists of strings).
+		"""
+		labels = {}
+		if roi_names is None or roi_names == {}:
+			roi_names = self.roi_names  # all the contour names
+			labels = self._assign_labels(roi_names, roi_select_first, roi_separate)  # only the ones that match the regex
+		elif isinstance(roi_names, dict):
+			for name, pattern in roi_names.items():
+				if isinstance(pattern, str):
+					matching_names = list(self._assign_labels([pattern], roi_select_first).keys())
+					if matching_names:
+						labels[name] = matching_names  # {"GTV": ["GTV1", "GTV2"]} is the result of _assign_labels()
+				elif isinstance(pattern, list):  # for inputs that have multiple patterns for the input, e.g. {"GTV": ["GTV.*", "HTVI.*"]}
+					labels[name] = []
+					for pattern_one in pattern:
+						matching_names = list(self._assign_labels([pattern_one], roi_select_first).keys())
+						if matching_names:
+							labels[name].extend(matching_names)  # {"GTV": ["GTV1", "GTV2"]}
+		if isinstance(roi_names, str):
+			roi_names = [roi_names]
+		if isinstance(roi_names, list):  # won't this always trigger after the previous?
+			labels = self._assign_labels(roi_names, roi_select_first)
+		logger.debug(f"Found {len(labels)} labels", labels=labels)
+		all_empty = True
+		for v in labels.values():
+			if v != []:
+				all_empty = False
+		if all_empty:
+			if not ignore_missing_regex:
+				raise ValueError(f"No ROIs matching {roi_names} found in {self.roi_names}.")
+			else:
+				return None
+		labels = {k:v for (k,v) in labels.items() if v != [] }
+		size = reference_image.GetSize()[::-1] + (len(labels),)
+		mask = np.zeros(size, dtype=np.uint8)
+
+		seg_roi_indices = {}
+		if roi_names != {} and isinstance(roi_names, dict):
+			for i, (name, label_list) in enumerate(labels.items()):
+				for label in label_list:
+					self.get_mask(reference_image, mask, label, i, continuous)
+				seg_roi_indices[name] = i
+
+		else:
+			for name, label in labels.items():
+				self.get_mask(reference_image, mask, name, label, continuous)
+			seg_roi_indices = {"_".join(k): v for v, k in groupby(labels, key=lambda x: labels[x])}
+
+		mask[mask > 1] = 1
+		mask = sitk.GetImageFromArray(mask, isVector=True)
+		mask.CopyInformation(reference_image)
+		mask = Segmentation(mask, roi_indices=seg_roi_indices, existing_roi_indices=existing_roi_indices, raw_roi_names=labels)  # in the segmentation, pass all the existing roi names and then process is in the segmentation class
+
+		return mask
+
+	def __repr__(self):
+		# return f"<StructureSet with ROIs: {self.roi_names!r}>"
+		sorted_rois = sorted(self.roi_names)
+		return f"<StructureSet with ROIs: {sorted_rois!r}>"

From 7e8852658a94072505902efda1fa96d2ece8d1f7 Mon Sep 17 00:00:00 2001
From: Jermiah Joseph <jermiahjoseph98@gmail.com>
Date: Tue, 24 Dec 2024 12:39:18 -0500
Subject: [PATCH 4/6] refactor: enhance segmentation module with detailed
 documentation and type annotations

---
 src/imgtools/modules/segmentation.py | 188 +++++++++++++++++++++++----
 1 file changed, 163 insertions(+), 25 deletions(-)

diff --git a/src/imgtools/modules/segmentation.py b/src/imgtools/modules/segmentation.py
index e58252dc..fac5ac8a 100644
--- a/src/imgtools/modules/segmentation.py
+++ b/src/imgtools/modules/segmentation.py
@@ -1,37 +1,175 @@
-from functools import wraps
+"""Manage and manipulate segmentation masks with multi-label support.
+
+This module provides the `Segmentation` class and associated utilities for working
+with medical image segmentation masks.
+It extends the functionality of `SimpleITK.Image` to include ROI-specific operations,
+label management, and metadata tracking.
+
+Classes
+-------
+Segmentation
+    A specialized class for handling multi-label segmentation masks. Includes
+    functionality for extracting individual labels, resolving overlaps, and
+    integrating with DICOM SEG metadata.
+
+Functions
+---------
+accepts_segmentations(f)
+    A decorator to ensure functions working on images handle `Segmentation` objects
+    correctly by preserving metadata and ROI labels.
+
+map_over_labels(segmentation, f, include_background=False, return_segmentation=True, **kwargs)
+    Applies a function to each label in a segmentation mask and combines the results,
+    optionally returning a new `Segmentation` object.
+
+Notes
+-----
+- The `Segmentation` class tracks metadata and ROI names, enabling easier management
+  of multi-label segmentation masks.
+- The `generate_sparse_mask` method resolves overlapping contours by taking the
+  maximum label value for each voxel, ensuring a consistent sparse representation.
+- Integration with DICOM SEG metadata is supported through the `from_dicom_seg`
+  class method, which creates `Segmentation` objects from DICOM SEG files.
+
+Examples
+--------
+# Creating a Segmentation object from a SimpleITK.Image
+>>> seg = Segmentation(image, roi_indices={'GTV': 1, 'PTV': 2})
+
+# Extracting an individual label
+>>> gtv_mask = seg.get_label(name='GTV')
+
+# Generating a sparse mask
+>>> sparse_mask = seg.generate_sparse_mask(verbose=True)
+
+# Applying a function to each label in the segmentation
+>>> def compute_statistics(label_image):
+...     return sitk.LabelStatisticsImageFilter().Execute(label_image)
+
+>>> stats = map_over_labels(segmentation=seg, f=compute_statistics)
+"""
+
+from __future__ import annotations
+
 import warnings
+from functools import wraps
+from typing import Any, Callable, Dict, List, Optional, Tuple, Union
 
 import numpy as np
 import SimpleITK as sitk
 
+from imgtools.utils import array_to_image, image_to_array
+
 from .sparsemask import SparseMask
 
-from ..utils import array_to_image, image_to_array
-from typing import Optional, Tuple, Set
 
+def accepts_segmentations(f: Callable) -> Callable:
+	"""A decorator that ensures functions can handle `Segmentation` objects correctly.
 
-def accepts_segmentations(f):
-    @wraps(f)
-    def wrapper(img, *args, **kwargs):
-        result = f(img, *args, **kwargs)
-        if isinstance(img, Segmentation):
-            result = sitk.Cast(result, sitk.sitkVectorUInt8)
-            return Segmentation(result, roi_indices=img.roi_indices, raw_roi_names=img.raw_roi_names)
-        else:
-            return result
-    return wrapper
-
-
-def map_over_labels(segmentation, f, include_background=False, return_segmentation=True, **kwargs):
-    if include_background:
-        labels = range(segmentation.num_labels + 1)
-    else:
-        labels = range(1, segmentation.num_labels + 1)
-    res = [f(segmentation.get_label(label=label), **kwargs) for label in labels]
-    if return_segmentation and isinstance(res[0], sitk.Image):
-        res = [sitk.Cast(r, sitk.sitkUInt8) for r in res]
-        res = Segmentation(sitk.Compose(*res), roi_indices=segmentation.roi_indices, raw_roi_names=segmentation.raw_roi_names)
-    return res
+	If the input image is an instance of `Segmentation`, the decorator preserves
+	the ROI indices and raw ROI names in the output.
+
+	This is useful when using functions that process images without losing metadata
+	for the Segmentation class.
+
+	Parameters
+	----------
+	f : Callable
+	    The function to wrap, which processes an image.
+
+	Returns
+	-------
+	Callable
+	    A wrapped function that preserves `Segmentation` metadata if the input
+	    is a `Segmentation` object.
+
+	Examples
+	--------
+	>>> @accepts_segmentations
+	... def some_processing_function(img, *args, **kwargs):
+	...     return img  # Perform some operation on the image
+	>>> segmentation = Segmentation(image, roi_indices={'ROI1': 1, 'ROI2': 2})
+	>>> result = some_processing_function(segmentation)
+	>>> isinstance(result, Segmentation)
+	True
+	"""
+
+	@wraps(f)
+	def wrapper(
+		img: Union[sitk.Image, Segmentation],
+		*args: Any,  # noqa
+		**kwargs: Any,  # noqa
+	) -> Union[sitk.Image, Segmentation]:
+		result = f(img, *args, **kwargs)
+		if isinstance(img, Segmentation):
+			result = sitk.Cast(result, sitk.sitkVectorUInt8)
+			return Segmentation(
+				result, roi_indices=img.roi_indices, raw_roi_names=img.raw_roi_names
+			)
+		return result
+
+	return wrapper
+
+
+def map_over_labels(
+	segmentation: Segmentation,
+	f: Callable[[sitk.Image], sitk.Image],
+	include_background: bool = False,
+	return_segmentation: bool = True,
+	**kwargs: Any,  # noqa
+) -> Union[List[sitk.Image], Segmentation]:
+	"""
+	Applies a function to each label in a segmentation mask.
+
+	This function iterates over all labels in the segmentation mask, applies
+	the provided function to each label individually, and optionally combines
+	the results into a new `Segmentation` object.
+
+	Parameters
+	----------
+	segmentation : Segmentation
+	    The segmentation object containing multiple ROI labels.
+	f : Callable[[sitk.Image], sitk.Image]
+	    A function to apply to each label in the segmentation.
+	include_background : bool, optional
+	    If True, includes the background label (label 0) in the operation.
+	    Default is False.
+	return_segmentation : bool, optional
+	    If True, combines the results into a new `Segmentation` object.
+	    If False, returns a list of processed labels as `sitk.Image`. Default is True.
+	**kwargs : Any
+	    Additional keyword arguments passed to the function `f`.
+
+	Returns
+	-------
+	Union[List[sitk.Image], Segmentation]
+	    A new `Segmentation` object if `return_segmentation` is True,
+	    otherwise a list of `sitk.Image` objects for each label.
+
+	Examples
+	--------
+	>>> def threshold(label_img, threshold=0.5):
+	...     return sitk.BinaryThreshold(label_img, lowerThreshold=threshold)
+	>>> segmentation = Segmentation(image, roi_indices={'ROI1': 1, 'ROI2': 2})
+	>>> result = map_over_labels(segmentation, threshold, threshold=0.5)
+	>>> isinstance(result, Segmentation)
+	True
+	"""
+	if include_background:
+		labels = range(segmentation.num_labels + 1)
+	else:
+		labels = range(1, segmentation.num_labels + 1)
+
+	res = [f(segmentation.get_label(label=label), **kwargs) for label in labels]
+
+	if return_segmentation and isinstance(res[0], sitk.Image):
+		res = [sitk.Cast(r, sitk.sitkUInt8) for r in res]
+		return Segmentation(
+			sitk.Compose(*res),
+			roi_indices=segmentation.roi_indices,
+			raw_roi_names=segmentation.raw_roi_names,
+		)
+	return res
 
 
 class Segmentation(sitk.Image):

From e7dd47861803816d2219a3ba9c2d8c16c57dd195 Mon Sep 17 00:00:00 2001
From: Jermiah Joseph <jermiahjoseph98@gmail.com>
Date: Tue, 24 Dec 2024 12:43:37 -0500
Subject: [PATCH 5/6] refactor: format

---
 src/imgtools/modules/structureset.py | 127 ++++++++++++++++-----------
 1 file changed, 77 insertions(+), 50 deletions(-)

diff --git a/src/imgtools/modules/structureset.py b/src/imgtools/modules/structureset.py
index 4fcf5dd9..4b4106d4 100644
--- a/src/imgtools/modules/structureset.py
+++ b/src/imgtools/modules/structureset.py
@@ -27,24 +27,28 @@
 multiple output formats for segmentation masks. It also integrates robust
 error handling and logging to handle malformed or incomplete DICOM files.
 """
+
 import re
+from itertools import groupby
 from typing import Dict, List, Optional, TypeVar, Union
+
 import numpy as np
-from pydicom.dataset import FileDataset
 import SimpleITK as sitk
 from pydicom import dcmread
-from itertools import groupby
+from pydicom.dataset import FileDataset
 from skimage.draw import polygon2mask
 
+from imgtools.logging import logger
 from imgtools.modules.segmentation import Segmentation
 from imgtools.utils import physical_points_to_idxs
-from imgtools.logging import logger
 
 T = TypeVar('T')
 
 
 class StructureSet:
-	def __init__(self, roi_points: Dict[str, List[np.ndarray]], metadata: Optional[Dict[str, T]] = None) -> None:
+	def __init__(
+		self, roi_points: Dict[str, List[np.ndarray]], metadata: Optional[Dict[str, T]] = None
+	) -> None:
 		"""Initialize the StructureSet class containing contour points.
 
 		Parameters
@@ -58,15 +62,17 @@ def __init__(self, roi_points: Dict[str, List[np.ndarray]], metadata: Optional[D
 
 		Examples
 		--------
-		>>> roi_points = {"GTV": [np.array([[0, 0, 0], [1, 1, 1]])]}
-		>>> metadata = {"PatientName": "John Doe"}
+		>>> roi_points = {'GTV': [np.array([[0, 0, 0], [1, 1, 1]])]}
+		>>> metadata = {'PatientName': 'John Doe'}
 		>>> structure_set = StructureSet(roi_points, metadata)
 		"""
 		self.roi_points: Dict[str, List[np.ndarray]] = roi_points
 		self.metadata: Dict[str, T] = metadata if metadata is not None else {}
 
 	@classmethod
-	def from_dicom_rtstruct(cls, rtstruct_path: str, suppress_warnings: bool = False) -> 'StructureSet':
+	def from_dicom_rtstruct(
+		cls, rtstruct_path: str, suppress_warnings: bool = False
+	) -> 'StructureSet':
 		"""Create a StructureSet instance from a DICOM RTSTRUCT file.
 
 		Parameters
@@ -90,7 +96,7 @@ def from_dicom_rtstruct(cls, rtstruct_path: str, suppress_warnings: bool = False
 
 		Examples
 		--------
-		>>> structure_set = StructureSet.from_dicom_rtstruct("path/to/rtstruct.dcm")
+		>>> structure_set = StructureSet.from_dicom_rtstruct('path/to/rtstruct.dcm')
 		"""
 		# Load the RTSTRUCT file
 		rtstruct: FileDataset = dcmread(rtstruct_path, force=True)
@@ -105,7 +111,7 @@ def from_dicom_rtstruct(cls, rtstruct_path: str, suppress_warnings: bool = False
 			except AttributeError as ae:
 				if not suppress_warnings:
 					logger.warning(
-						f"Could not get points for ROI `{name}`.",
+						f'Could not get points for ROI `{name}`.',
 						rtstruct_path=rtstruct_path,
 						error=ae,
 					)
@@ -115,7 +121,7 @@ def from_dicom_rtstruct(cls, rtstruct_path: str, suppress_warnings: bool = False
 
 		# Return the StructureSet instance
 		return cls(roi_points, metadata)
-	
+
 	@staticmethod
 	def _get_roi_points(rtstruct: FileDataset, roi_index: int) -> List[np.ndarray]:
 		"""Extract and reshapes contour points for a specific ROI in an RTSTRUCT file.
@@ -140,21 +146,23 @@ def _get_roi_points(rtstruct: FileDataset, roi_index: int) -> List[np.ndarray]:
 
 		Examples
 		--------
-		>>> rtstruct = dcmread("path/to/rtstruct.dcm", force=True)
+		>>> rtstruct = dcmread('path/to/rtstruct.dcm', force=True)
 		>>> points = StructureSet._get_roi_points(rtstruct, 0)
 		"""
 		# Check for ROIContourSequence
-		if not hasattr(rtstruct, "ROIContourSequence"):
+		if not hasattr(rtstruct, 'ROIContourSequence'):
 			raise AttributeError("The DICOM RTSTRUCT file is missing 'ROIContourSequence'.")
-		
+
 		# Check if ROI index exists in the sequence
 		if roi_index >= len(rtstruct.ROIContourSequence) or roi_index < 0:
-			raise AttributeError(f"ROI index {roi_index} is out of bounds for the 'ROIContourSequence'.")
+			raise AttributeError(
+				f"ROI index {roi_index} is out of bounds for the 'ROIContourSequence'."
+			)
 
 		roi_contour = rtstruct.ROIContourSequence[roi_index]
 
 		# Check for ContourSequence in the specified ROI
-		if not hasattr(roi_contour, "ContourSequence"):
+		if not hasattr(roi_contour, 'ContourSequence'):
 			raise AttributeError(f"ROI at index {roi_index} is missing 'ContourSequence'.")
 
 		contour_sequence = roi_contour.ContourSequence
@@ -162,7 +170,7 @@ def _get_roi_points(rtstruct: FileDataset, roi_index: int) -> List[np.ndarray]:
 		# Check for ContourData in each contour
 		contour_points = []
 		for i, slc in enumerate(contour_sequence):
-			if not hasattr(slc, "ContourData"):
+			if not hasattr(slc, 'ContourData'):
 				raise AttributeError(
 					f"Contour {i} in ROI at index {roi_index} is missing 'ContourData'."
 				)
@@ -175,7 +183,6 @@ def roi_names(self) -> List[str]:
 		"""List of all ROI (Region of Interest) names."""
 		return list(self.roi_points.keys())
 
-
 	def _assign_labels(
 		self,
 		names: List[Union[str, List[str]]],
@@ -216,27 +223,27 @@ def _assign_labels(
 		Examples
 		--------
 		Lets say we have the following ROI names:
-		>>> self.roi_names = ["GTV", "PTV", "CTV_0", "CTV_1"]
+		>>> self.roi_names = ['GTV', 'PTV', 'CTV_0', 'CTV_1']
 
 		Case 1: Default behavior
 		All matching ROIs for each pattern are assigned the same label(number).
 		note how the CTV ROIs are assigned the same label: 1
-		>>> self._assign_labels(["GTV", "CTV.*"])
+		>>> self._assign_labels(['GTV', 'CTV.*'])
 		{'GTV': 0, 'CTV_0': 1, 'CTV_1': 1}
 
 		Case 2: Select only the first match for each pattern
 		Subsequent matches are ignored.
-		>>> self._assign_labels(["GTV", "CTV.*"], roi_select_first=True)
+		>>> self._assign_labels(['GTV', 'CTV.*'], roi_select_first=True)
 		{'GTV': 0, 'CTV_0': 1}
 
 		Case 3: Separate labels for each match
 		Even if a pattern matches multiple ROIs, each ROI gets a separate label.
 		note how now the CTV ROIs are assigned different labels: 1 and 2
-		>>> self._assign_labels(["GTV", "CTV.*"], roi_separate=True)
+		>>> self._assign_labels(['GTV', 'CTV.*'], roi_separate=True)
 		{'GTV': 0, 'CTV_0': 1, 'CTV_1': 2}
 
 		# Case 4: Grouped patterns
-		>>> self._assign_labels([["GTV", "PTV"], "CTV.*"])
+		>>> self._assign_labels([['GTV', 'PTV'], 'CTV.*'])
 		{'GTV': 0, 'PTV': 0, 'CTV_0': 1, 'CTV_1': 1}
 		"""
 		if not names:
@@ -280,50 +287,57 @@ def _assign_labels(
 						if re.fullmatch(subpattern, name, flags=re.IGNORECASE):
 							matched = True
 							if roi_separate:
-								labels[f"{name}_{i}"] = cur_label
+								labels[f'{name}_{i}'] = cur_label
 							else:
 								labels[name] = cur_label
 				cur_label += 1
 
 		# Validate output
 		if not labels:
-			raise ValueError(f"No matching ROIs found for the provided patterns: {names}")
+			raise ValueError(f'No matching ROIs found for the provided patterns: {names}')
 
 		return labels
 
 	def get_mask(self, reference_image, mask, label, idx, continuous):
 		size = reference_image.GetSize()[::-1]
 		physical_points = self.roi_points.get(label, np.array([]))
-		mask_points = physical_points_to_idxs(reference_image, physical_points, continuous=continuous)
+		mask_points = physical_points_to_idxs(
+			reference_image, physical_points, continuous=continuous
+		)
 		for contour in mask_points:
 			try:
 				z, slice_points = np.unique(contour[:, 0]), contour[:, 1:]
-				if len(z) == 1:  # assert len(z) == 1, f"This contour ({name}) spreads across more than 1 slice."
+				if (
+					len(z) == 1
+				):  # assert len(z) == 1, f"This contour ({name}) spreads across more than 1 slice."
 					slice_mask = polygon2mask(size[1:], slice_points)
 					mask[z[0], :, :, idx] += slice_mask
 			except:  # rounding errors for points on the boundary
 				if z == mask.shape[0]:
 					z -= 1
-				elif z == -1: #?
+				elif z == -1:  # ?
 					z += 1
 				elif z > mask.shape[0] or z < -1:
-					raise IndexError(f"{z} index is out of bounds for image sized {mask.shape}.")
-				
-				# if the contour spans only 1 z-slice 
+					raise IndexError(f'{z} index is out of bounds for image sized {mask.shape}.')
+
+				# if the contour spans only 1 z-slice
 				if len(z) == 1:
 					z_idx = int(np.floor(z[0]))
 					slice_mask = polygon2mask(size[1:], slice_points)
 					mask[z_idx, :, :, idx] += slice_mask
 				else:
-					raise ValueError("This contour is corrupted and spans across 2 or more slices.")
-
-	def to_segmentation(self, reference_image: sitk.Image,
-						roi_names: Dict[str, str] = None,
-						continuous: bool = True,
-						existing_roi_indices: Dict[str, int] = None,
-						ignore_missing_regex: bool = False,
-						roi_select_first: bool = False,
-						roi_separate: bool = False) -> Segmentation:
+					raise ValueError('This contour is corrupted and spans across 2 or more slices.')
+
+	def to_segmentation(
+		self,
+		reference_image: sitk.Image,
+		roi_names: Dict[str, str] = None,
+		continuous: bool = True,
+		existing_roi_indices: Dict[str, int] = None,
+		ignore_missing_regex: bool = False,
+		roi_select_first: bool = False,
+		roi_separate: bool = False,
+	) -> Segmentation:
 		"""Convert the structure set to a Segmentation object.
 
 		Parameters
@@ -348,7 +362,7 @@ def to_segmentation(self, reference_image: sitk.Image,
 		that `roi_names=['pat']` and `roi_names=[['pat']]` can lead
 		to different label assignments, depending on how many ROI names
 		match the pattern. E.g. if `self.roi_names = ['fooa', 'foob']`,
-		passing `roi_names=['foo(a|b)']` will result in a segmentation with 
+		passing `roi_names=['foo(a|b)']` will result in a segmentation with
 		two labels, but passing `roi_names=[['foo(a|b)']]` will result in
 		one label for both `'fooa'` and `'foob'`.
 
@@ -359,34 +373,42 @@ def to_segmentation(self, reference_image: sitk.Image,
 		labels = {}
 		if roi_names is None or roi_names == {}:
 			roi_names = self.roi_names  # all the contour names
-			labels = self._assign_labels(roi_names, roi_select_first, roi_separate)  # only the ones that match the regex
+			labels = self._assign_labels(
+				roi_names, roi_select_first, roi_separate
+			)  # only the ones that match the regex
 		elif isinstance(roi_names, dict):
 			for name, pattern in roi_names.items():
 				if isinstance(pattern, str):
 					matching_names = list(self._assign_labels([pattern], roi_select_first).keys())
 					if matching_names:
-						labels[name] = matching_names  # {"GTV": ["GTV1", "GTV2"]} is the result of _assign_labels()
-				elif isinstance(pattern, list):  # for inputs that have multiple patterns for the input, e.g. {"GTV": ["GTV.*", "HTVI.*"]}
+						labels[name] = (
+							matching_names  # {"GTV": ["GTV1", "GTV2"]} is the result of _assign_labels()
+						)
+				elif isinstance(
+					pattern, list
+				):  # for names that map to multiple patterns, e.g. {"GTV": ["GTV.*", "HTVI.*"]}
 					labels[name] = []
 					for pattern_one in pattern:
-						matching_names = list(self._assign_labels([pattern_one], roi_select_first).keys())
+						matching_names = list(
+							self._assign_labels([pattern_one], roi_select_first).keys()
+						)
 						if matching_names:
 							labels[name].extend(matching_names)  # {"GTV": ["GTV1", "GTV2"]}
 		if isinstance(roi_names, str):
 			roi_names = [roi_names]
 		if isinstance(roi_names, list):  # won't this always trigger after the previous?
 			labels = self._assign_labels(roi_names, roi_select_first)
-		logger.debug(f"Found {len(labels)} labels", labels=labels)
+		logger.debug(f'Found {len(labels)} labels', labels=labels)
 		all_empty = True
 		for v in labels.values():
 			if v != []:
 				all_empty = False
 		if all_empty:
 			if not ignore_missing_regex:
-				raise ValueError(f"No ROIs matching {roi_names} found in {self.roi_names}.")
+				raise ValueError(f'No ROIs matching {roi_names} found in {self.roi_names}.')
 			else:
 				return None
-		labels = {k:v for (k,v) in labels.items() if v != [] }
+		labels = {k: v for (k, v) in labels.items() if v != []}
 		size = reference_image.GetSize()[::-1] + (len(labels),)
 		mask = np.zeros(size, dtype=np.uint8)
 
@@ -400,16 +422,21 @@ def to_segmentation(self, reference_image: sitk.Image,
 		else:
 			for name, label in labels.items():
 				self.get_mask(reference_image, mask, name, label, continuous)
-			seg_roi_indices = {"_".join(k): v for v, k in groupby(labels, key=lambda x: labels[x])}
+			seg_roi_indices = {'_'.join(k): v for v, k in groupby(labels, key=lambda x: labels[x])}
 
 		mask[mask > 1] = 1
 		mask = sitk.GetImageFromArray(mask, isVector=True)
 		mask.CopyInformation(reference_image)
-		mask = Segmentation(mask, roi_indices=seg_roi_indices, existing_roi_indices=existing_roi_indices, raw_roi_names=labels)  # in the segmentation, pass all the existing roi names and then process is in the segmentation class
+		mask = Segmentation(
+			mask,
+			roi_indices=seg_roi_indices,
+			existing_roi_indices=existing_roi_indices,
+			raw_roi_names=labels,
+		)  # in the segmentation, pass all the existing roi names and then process them in the segmentation class
 
 		return mask
 
 	def __repr__(self):
 		# return f"<StructureSet with ROIs: {self.roi_names!r}>"
 		sorted_rois = sorted(self.roi_names)
-		return f"<StructureSet with ROIs: {sorted_rois!r}>"
+		return f'<StructureSet with ROIs: {sorted_rois!r}>'

From faf16e314fb4d2843af70b9ffc99b9ca4c7485ed Mon Sep 17 00:00:00 2001
From: Jermiah Joseph <jermiahjoseph98@gmail.com>
Date: Tue, 24 Dec 2024 14:14:49 -0500
Subject: [PATCH 6/6] refactor: improve error messages and add tests for
 StructureSet class

---
 config/ruff.toml                     |   1 +
 src/imgtools/modules/structureset.py |  46 ++++----
 tests/modules/test_structureset.py   | 161 +++++++++++++++++++++++++++
 3 files changed, 189 insertions(+), 19 deletions(-)
 create mode 100644 tests/modules/test_structureset.py

diff --git a/config/ruff.toml b/config/ruff.toml
index 917c2424..e89dc36b 100644
--- a/config/ruff.toml
+++ b/config/ruff.toml
@@ -103,6 +103,7 @@ ignore = [
   "COM812", # https://docs.astral.sh/ruff/rules/missing-trailing-comma/#missing-trailing-comma-com812
   "D206",
   "N813",
+  "EM101",
 ]
 [lint.pydocstyle]
 convention = "numpy"
diff --git a/src/imgtools/modules/structureset.py b/src/imgtools/modules/structureset.py
index 4b4106d4..a7fcd1d0 100644
--- a/src/imgtools/modules/structureset.py
+++ b/src/imgtools/modules/structureset.py
@@ -155,15 +155,15 @@ def _get_roi_points(rtstruct: FileDataset, roi_index: int) -> List[np.ndarray]:
 
 		# Check if ROI index exists in the sequence
 		if roi_index >= len(rtstruct.ROIContourSequence) or roi_index < 0:
-			raise AttributeError(
-				f"ROI index {roi_index} is out of bounds for the 'ROIContourSequence'."
-			)
+			msg = f"ROI index {roi_index} is out of bounds for the 'ROIContourSequence'."
+			raise AttributeError(msg)
 
 		roi_contour = rtstruct.ROIContourSequence[roi_index]
 
 		# Check for ContourSequence in the specified ROI
 		if not hasattr(roi_contour, 'ContourSequence'):
-			raise AttributeError(f"ROI at index {roi_index} is missing 'ContourSequence'.")
+			msg = f"ROI at index {roi_index} is missing 'ContourSequence'."
+			raise AttributeError(msg)
 
 		contour_sequence = roi_contour.ContourSequence
 
@@ -171,9 +171,8 @@ def _get_roi_points(rtstruct: FileDataset, roi_index: int) -> List[np.ndarray]:
 		contour_points = []
 		for i, slc in enumerate(contour_sequence):
 			if not hasattr(slc, 'ContourData'):
-				raise AttributeError(
-					f"Contour {i} in ROI at index {roi_index} is missing 'ContourData'."
-				)
+				msg = f"Contour {i} in ROI at index {roi_index} is missing 'ContourData'."
+				raise AttributeError(msg)
 			contour_points.append(np.array(slc.ContourData).reshape(-1, 3))
 
 		return contour_points
@@ -264,18 +263,22 @@ def _assign_labels(
 
 		# Case 2: Iterate over `names` (could contain regex patterns or sublists)
 		for pattern in names:
-			# Skip further processing if all names have been assigned
-			if sorted(names) == sorted(labels.keys()):
-				break
+			# TODO: refactor this to use a generator function for better readability
+			# and to avoid code duplication
 
 			# Single pattern: string or regex
 			if isinstance(pattern, str):
-				for i, name in enumerate(self.roi_names):
-					if re.fullmatch(pattern, name, flags=re.IGNORECASE):
-						labels[name] = cur_label
+				matched = False
+				for _, roi_name in enumerate(self.roi_names):
+					if re.fullmatch(pattern, roi_name, flags=re.IGNORECASE):
+						matched = True
+						# Group all matches under the same label
+						labels[roi_name] = cur_label
 						if roi_select_first:
 							break
-				cur_label += 1
+				# Increment label counter only if at least one match occurred
+				if matched:
+					cur_label += 1
 
 			# Nested patterns: list of strings or regexes
 			elif isinstance(pattern, list):
@@ -283,18 +286,23 @@ def _assign_labels(
 				for subpattern in pattern:
 					if roi_select_first and matched:
 						break
-					for i, name in enumerate(self.roi_names):
-						if re.fullmatch(subpattern, name, flags=re.IGNORECASE):
+					for i, roi_name in enumerate(self.roi_names):
+						if re.fullmatch(subpattern, roi_name, flags=re.IGNORECASE):
 							matched = True
 							if roi_separate:
-								labels[f'{name}_{i}'] = cur_label
+								labels[f'{roi_name}_{i}'] = cur_label
 							else:
-								labels[name] = cur_label
+								labels[roi_name] = cur_label
 				cur_label += 1
 
+			else:
+				msg = f'Invalid pattern type: {type(pattern)}, expected str or list.'
+				raise ValueError(msg)
+
 		# Validate output
 		if not labels:
-			raise ValueError(f'No matching ROIs found for the provided patterns: {names}')
+			msg = f'No matching ROIs found for the provided patterns: {names}'
+			raise ValueError(msg)
 
 		return labels
 
diff --git a/tests/modules/test_structureset.py b/tests/modules/test_structureset.py
new file mode 100644
index 00000000..cf24a14a
--- /dev/null
+++ b/tests/modules/test_structureset.py
@@ -0,0 +1,161 @@
+import pytest
+import numpy as np
+from unittest.mock import MagicMock, patch
+from typing import Dict, List
+from pydicom.dataset import Dataset
+from imgtools.modules.structureset import StructureSet
+import pathlib
+
+@pytest.fixture
+def modalities_path():
+    """Map CT, RTSTRUCT, RTDOSE and PT to POSIX path strings under the HN-CHUS-052 data folder."""
+    base_dir = pathlib.Path(__file__).parent.parent.parent
+    patient_dir = base_dir / "data" / "Head-Neck-PET-CT" / "HN-CHUS-052"
+    assert patient_dir.exists(), "Dataset not found"
+    # Every entry is resolved relative to the patient directory checked above.
+    return {
+        "CT": (patient_dir / "08-27-1885-CA ORL FDG TEP POS TX-94629/3.000000-Merged-06362").as_posix(),
+        "RTSTRUCT": (patient_dir / "08-27-1885-OrophCB.0OrophCBTRTID derived StudyInstanceUID.-94629/Pinnacle POI-41418").as_posix(),
+        "RTDOSE": (patient_dir / "08-27-1885-OrophCB.0OrophCBTRTID derived StudyInstanceUID.-94629/11376").as_posix(),
+        "PT": (patient_dir / "08-27-1885-CA ORL FDG TEP POS TX-94629/532790.000000-LOR-RAMLA-44600").as_posix(),
+    }
+
+@pytest.fixture
+def roi_points():
+    """Provide six mock ROIs, each mapped to a list holding one 2x3 point array."""
+    # ROI name -> first coordinate k; its two points are [k, k, k] and [k+1, k+1, k+1].
+    start_coords = {
+        "GTV": 0, "PTV": 2, "CTV_0": 4,
+        "CTV_1": 6, "CTV_2": 8, "ExtraROI": 10,
+    }
+    return {
+        name: [np.array([[k] * 3, [k + 1] * 3])] for name, k in start_coords.items()
+    }
+
+@pytest.fixture
+def metadata():
+    """Provide a minimal mock metadata mapping with a single PatientName entry."""
+    return dict(PatientName="John Doe")
+
+# Parametrized cases for _assign_labels: exact names, regexes, and one nested group
+@pytest.mark.parametrize(
+    "names, roi_select_first, roi_separate, expected",
+    [
+        # Case 1: Exact ROI names, one label per pattern
+        (["GTV", "PTV"], False, False, {"GTV": 0, "PTV": 1}),
+
+        # Case 2: Exact name mixed with a regex
+        (["GTV", "P.*"], False, False, {"GTV": 0, "PTV": 1}),
+
+        # Case 3: roi_select_first=True; each regex has a single candidate in the fixture
+        (["G.*", "P.*"], True, False, {"GTV": 0, "PTV": 1}),
+
+        # Case 4: roi_separate=True with a single-match string pattern
+        (["P.*"], False, True, {"PTV": 0}),
+
+        # Case 5: Regex pattern with multiple matches (all share one label)
+        (["CTV.*"], False, False, {"CTV_0": 0, "CTV_1": 0, "CTV_2": 0}),
+
+        # Case 6: Same pattern with roi_separate=True; expected mapping equals Case 5
+        (["CTV.*"], False, True, {"CTV_0": 0, "CTV_1": 0, "CTV_2": 0}),
+
+        # Case 7: Nested group shares label 0; the following pattern gets label 1
+        ([["GTV", "PTV"], "CTV.*"], False, False, {"GTV": 0, "PTV": 0, "CTV_0": 1, "CTV_1": 1, "CTV_2": 1}),
+
+        # Case 8 (disabled): Grouped patterns with separate labels for regex matches
+        # ([["GTV", "PTV"], "CTV.*"], False, True, {"GTV": 0, "PTV": 0, "CTV_0": 1, "CTV_1": 2, "CTV_2": 3}),
+    ],
+)
+def test_assign_labels(names, roi_select_first, roi_separate, expected, roi_points):
+    """Check the label mapping returned by _assign_labels for each parametrized case."""
+    # Arguments are passed positionally in the order: names, roi_select_first, roi_separate.
+    labels = StructureSet(roi_points)._assign_labels(names, roi_select_first, roi_separate)
+    assert labels == expected
+
+
+# Parametrized cases for _assign_labels with several patterns and nested groups
+@pytest.mark.parametrize(
+    "names, roi_select_first, roi_separate, expected",
+    [
+        # Case 1: Complex regex patterns with partial matches
+        (["G.*", "C.*1", "Extra.*"], False, False, {"GTV": 0, "CTV_1": 1, "ExtraROI": 2}),
+
+        # Case 2: Nested group followed by two plain regex patterns
+        ([["GTV", "CTV.*"], "P.*", "Extra.*"], False, False, {"GTV": 0, "CTV_0": 0, "CTV_1": 0, "CTV_2": 0, "PTV": 1, "ExtraROI": 2}),
+        # ([["GTV", "CTV.*"], "P.*", "Extra.*"], False, True, {"GTV": 0, "CTV_0_0": 1, "CTV_1_1": 2, "CTV_2_2": 3, "PTV": 4, "ExtraROI": 5}),
+
+        # Case 3: Regex pattern that matches all ROIs
+        ([".*"], False, False, {"GTV": 0, "PTV": 0, "CTV_0": 0, "CTV_1": 0, "CTV_2": 0, "ExtraROI": 0}),
+        # ([".*"], False, True, {"GTV_0": 0, "PTV_1": 1, "CTV_0_2": 2, "CTV_1_3": 3, "CTV_2_4": 4, "ExtraROI_5": 5}),
+
+        # Case 4: Several regexes, one of which matches multiple ROIs
+        (["G.*", "C.*", "Extra.*"], False, False, {"GTV": 0, "CTV_0": 1, "CTV_1": 1, "CTV_2": 1, "ExtraROI": 2}),
+        # (["G.*", "C.*", "Extra.*"], False, True, {"GTV": 0, "CTV_0_0": 1, "CTV_1_1": 2, "CTV_2_2": 3, "ExtraROI_3": 4}),
+
+        # Case 5: No matches must raise ValueError; strict=True fails the run if it does not
+        pytest.param(["NonExistent.*"], False, False, {}, marks=pytest.mark.xfail(raises=ValueError, strict=True)),
+
+        # Case 6 (disabled): Conflicting options; see test_assign_labels_invalid
+        # pytest.param(["G.*"], True, True, None, marks=pytest.mark.xfail(raises=ValueError)),
+    ],
+)
+def test_assign_labels_complex(names, roi_select_first, roi_separate, expected, roi_points):
+    """Check _assign_labels against multi-pattern and nested-pattern inputs."""
+    # Arguments are passed positionally in the order: names, roi_select_first, roi_separate.
+    labels = StructureSet(roi_points)._assign_labels(names, roi_select_first, roi_separate)
+    assert labels == expected
+
+
+def test_assign_labels_invalid(roi_points):
+    """Check that _assign_labels rejects empty names and conflicting flags."""
+    sset = StructureSet(roi_points)
+    # These messages are handed to pytest.raises(match=...), which treats them as regexes.
+    empty_names_msg = "The 'names' list cannot be empty."
+    conflict_msg = "The options 'roi_select_first' and 'roi_separate' cannot both be True."
+
+    # An empty pattern list must be refused.
+    with pytest.raises(ValueError, match=empty_names_msg):
+        sset._assign_labels([])
+
+    # roi_select_first and roi_separate may not be enabled together.
+    with pytest.raises(ValueError, match=conflict_msg):
+        sset._assign_labels(["G.*"], roi_select_first=True, roi_separate=True)
+
+
+def test_init(roi_points, metadata):
+    """Verify the constructor stores roi_points and metadata, with {} as default metadata."""
+    with_meta = StructureSet(roi_points, metadata)
+    assert with_meta.roi_points == roi_points
+    assert with_meta.metadata == metadata
+
+    # Omitting metadata is expected to yield an empty dict.
+    without_meta = StructureSet(roi_points)
+    assert without_meta.metadata == {}
+
+@patch("imgtools.modules.structureset.dcmread")
+def test_from_dicom_rtstruct(mock_dcmread):
+    """Test from_dicom_rtstruct method with mocked DICOM file."""
+    # imgtools.modules.structureset.dcmread is patched to return the MagicMock built below.
+    mock_rtstruct = MagicMock()
+    mock_rtstruct.StructureSetROISequence = [
+        MagicMock(ROIName="GTV"),
+        MagicMock(ROIName="PTV"),
+    ]
+    mock_rtstruct.ROIContourSequence = [
+        MagicMock(),
+        MagicMock(),
+    ]
+    mock_rtstruct.ROIContourSequence[0].ContourSequence = [
+        MagicMock(ContourData=[1.0, 2.0, 3.0])
+    ]
+    mock_rtstruct.ROIContourSequence[1].ContourSequence = [
+        MagicMock(ContourData=[4.0, 5.0, 6.0])
+    ]
+    mock_dcmread.return_value = mock_rtstruct
+
+    structure_set = StructureSet.from_dicom_rtstruct('dummy')
+    # Both mocked ROIs should be present, each with exactly one contour entry.
+    assert "GTV" in structure_set.roi_points
+    assert "PTV" in structure_set.roi_points
+    assert len(structure_set.roi_points["GTV"]) == 1
+    assert len(structure_set.roi_points["PTV"]) == 1