Skip to content
New issue

Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.

By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.

Already on GitHub? Sign in to your account

More robust name validation #703

Open
wants to merge 17 commits into
base: main
Choose a base branch
from
Open
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
4 changes: 4 additions & 0 deletions CHANGELOG.md
Original file line number Diff line number Diff line change
Expand Up @@ -19,6 +19,10 @@ and this project adheres to [Semantic Versioning][].

## [0.2.3] - 2024-09-25

### Changed

- Validation of element and column names allowing `_-.` and alphanumeric characters #624

### Minor

- Added `clip: bool = False` parameter to `polygon_query()` #670
Expand Down
15 changes: 15 additions & 0 deletions docs/design_doc.md
Original file line number Diff line number Diff line change
Expand Up @@ -130,6 +130,21 @@ _SpatialData_ follows the OME-NGFF specifications whenever possible and therefor
- Any `Element` MAY be annotated by `Tables`; also `Shapes` and `Points` MAY contain annotations within themselves as additional dataframe columns (e.g. intensity of point spread function of a each point, or gene id).
- `Tables` CAN NOT be annotated by other `Tables`.

#### Naming

Names of SpatialData elements must fulfill certain restrictions to ensure robust storage and compatibility:

- MUST NOT be the empty string ``.
- MUST only contain alphanumeric characters or hyphens `-`, dots `.`, underscores `_`. Alphanumeric includes letters from different alphabets and number-like symbols, but excludes whitespace, slashes and other symbols.
- MUST NOT be the full strings `.` or `..`, which would have special meanings as file paths.
- MUST NOT start with double underscores `__`.
- MUST NOT only differ in character case, to avoid name collisions on case-insensitive systems.

In tables, the above restrictions apply to the column names of `obs` and `var`, and to the key names of the `obsm`,
`obsp`, `varm`, `varp`, `uns`, `layers` slots (for example `adata.obs['has space']` and `adata.layers['.']` are not
allowed).
Additionally, dataframes in tables MUST NOT have a column named `_index`, which is reserved.

#### Images

Images of a sample. Should conform to the [OME-NGFF concept of an image](https://ngff.openmicroscopy.org/latest/#image-layout).
Expand Down
27 changes: 14 additions & 13 deletions src/spatialdata/_core/_elements.py
Original file line number Diff line number Diff line change
Expand Up @@ -11,6 +11,7 @@
from dask.dataframe import DataFrame as DaskDataFrame
from geopandas import GeoDataFrame

from spatialdata._core.validation import check_key_is_case_insensitively_unique, check_valid_name
from spatialdata._types import Raster_T
from spatialdata.models import (
Image2DModel,
Expand All @@ -30,30 +31,30 @@ def __init__(self, shared_keys: set[str | None]) -> None:
self._shared_keys = shared_keys
super().__init__()

@staticmethod
def _check_valid_name(name: str) -> None:
if not isinstance(name, str):
raise TypeError(f"Name must be a string, not {type(name).__name__}.")
if len(name) == 0:
raise ValueError("Name cannot be an empty string.")
if not all(c.isalnum() or c in "_-" for c in name):
raise ValueError("Name must contain only alphanumeric characters, underscores, and hyphens.")
def _add_shared_key(self, key: str) -> None:
self._shared_keys.add(key)

def _remove_shared_key(self, key: str) -> None:
self._shared_keys.remove(key)

@staticmethod
def _check_key(key: str, element_keys: Iterable[str], shared_keys: set[str | None]) -> None:
Elements._check_valid_name(key)
check_valid_name(key)
if key in element_keys:
warn(f"Key `{key}` already exists. Overwriting it in-memory.", UserWarning, stacklevel=2)
else:
if key in shared_keys:
raise KeyError(f"Key `{key}` already exists.")
try:
check_key_is_case_insensitively_unique(key, shared_keys)
except ValueError as e:
# Validation raises ValueError, but inappropriate mapping key must raise KeyError.
raise KeyError(*e.args) from e

def __setitem__(self, key: str, value: Any) -> None:
self._shared_keys.add(key)
self._add_shared_key(key)
super().__setitem__(key, value)

def __delitem__(self, key: str) -> None:
self._shared_keys.remove(key)
self._remove_shared_key(key)
super().__delitem__(key)


Expand Down
97 changes: 59 additions & 38 deletions src/spatialdata/_core/spatialdata.py
Original file line number Diff line number Diff line change
Expand Up @@ -22,6 +22,13 @@
from xarray import DataArray

from spatialdata._core._elements import Images, Labels, Points, Shapes, Tables
from spatialdata._core.validation import (
check_all_keys_case_insensitively_unique,
check_target_region_column_symmetry,
check_valid_name,
raise_validation_errors,
validate_table_attr_keys,
)
from spatialdata._logging import logger
from spatialdata._types import ArrayLike, Raster_T
from spatialdata._utils import _deprecation_alias, _error_message_add_element
Expand All @@ -33,7 +40,6 @@
PointsModel,
ShapesModel,
TableModel,
check_target_region_column_symmetry,
get_model,
get_table_keys,
)
Expand Down Expand Up @@ -137,26 +143,37 @@ def __init__(
f"Element names must be unique. The following element names are used multiple times: {duplicates}"
)

if images is not None:
for k, v in images.items():
self.images[k] = v

if labels is not None:
for k, v in labels.items():
self.labels[k] = v

if shapes is not None:
for k, v in shapes.items():
self.shapes[k] = v

if points is not None:
for k, v in points.items():
self.points[k] = v

if tables is not None:
for k, v in tables.items():
self.validate_table_in_spatialdata(v)
self.tables[k] = v
with raise_validation_errors(
title="Cannot construct SpatialData object, input contains invalid elements.\n"
"For renaming, please see the discussion here https://github.com/scverse/spatialdata/discussions/707 .",
exc_type=(ValueError, KeyError),
) as collect_error:

if images is not None:
for k, v in images.items():
with collect_error(location=("images", k)):
self.images[k] = v

if labels is not None:
for k, v in labels.items():
with collect_error(location=("labels", k)):
self.labels[k] = v

if shapes is not None:
for k, v in shapes.items():
with collect_error(location=("shapes", k)):
self.shapes[k] = v

if points is not None:
for k, v in points.items():
with collect_error(location=("points", k)):
self.points[k] = v

if tables is not None:
for k, v in tables.items():
with collect_error(location=("tables", k)):
self.validate_table_in_spatialdata(v)
self.tables[k] = v

self._query = QueryManager(self)

Expand Down Expand Up @@ -1115,6 +1132,20 @@ def _validate_can_safely_write_to_path(
" the current Zarr store." + WORKAROUND
)

def _validate_all_elements(self) -> None:
with raise_validation_errors(
title="SpatialData contains elements with invalid names.\n"
"For renaming, please see the discussion here https://github.com/scverse/spatialdata/discussions/707 .",
exc_type=ValueError,
) as collect_error:
for element_type, element_name, element in self.gen_elements():
element_path = (element_type, element_name)
with collect_error(location=element_path):
check_valid_name(element_name)
if element_type == "tables":
with collect_error(location=element_path):
validate_table_attr_keys(element, location=element_path)

def write(
self,
file_path: str | Path,
Expand Down Expand Up @@ -1150,6 +1181,7 @@ def write(
if isinstance(file_path, str):
file_path = Path(file_path)
self._validate_can_safely_write_to_path(file_path, overwrite=overwrite)
self._validate_all_elements()

store = parse_url(file_path, mode="w").store
_ = zarr.group(store=store, overwrite=overwrite)
Expand Down Expand Up @@ -1244,9 +1276,7 @@ def write_element(
self.write_element(name, overwrite=overwrite)
return

from spatialdata._core._elements import Elements

Elements._check_valid_name(element_name)
check_valid_name(element_name)
self._validate_element_names_are_unique()
element = self.get(element_name)
if element is None:
Expand All @@ -1265,6 +1295,8 @@ def write_element(
break
if element_type is None:
raise ValueError(f"Element with name {element_name} not found in SpatialData object.")
if element_type == "tables":
validate_table_attr_keys(element)

self._check_element_not_on_disk_with_different_type(element_type=element_type, element_name=element_name)

Expand Down Expand Up @@ -1316,11 +1348,8 @@ def delete_element_from_disk(self, element_name: str | list[str]) -> None:
self.delete_element_from_disk(name)
return

from spatialdata._core._elements import Elements
from spatialdata._io._utils import _backed_elements_contained_in_path

Elements._check_valid_name(element_name)

if self.path is None:
raise ValueError("The SpatialData object is not backed by a Zarr store.")

Expand Down Expand Up @@ -1451,10 +1480,8 @@ def write_transformations(self, element_name: str | None = None) -> None:
element_name
The name of the element to write. If None, write the transformations of all elements.
"""
from spatialdata._core._elements import Elements

if element_name is not None:
Elements._check_valid_name(element_name)
check_valid_name(element_name)

# recursively write the transformation for all the SpatialElement
if element_name is None:
Expand Down Expand Up @@ -1541,10 +1568,8 @@ def write_metadata(self, element_name: str | None = None, consolidate_metadata:
-----
When using the methods `write()` and `write_element()`, the metadata is written automatically.
"""
from spatialdata._core._elements import Elements

if element_name is not None:
Elements._check_valid_name(element_name)
check_valid_name(element_name)

self.write_transformations(element_name)
# TODO: write .uns['spatialdata_attrs'] metadata for AnnData.
Expand Down Expand Up @@ -1994,11 +2019,7 @@ def _validate_element_names_are_unique(self) -> None:
ValueError
If the element names are not unique.
"""
element_names = set()
for _, element_name, _ in self.gen_elements():
if element_name in element_names:
raise ValueError(f"Element name {element_name!r} is not unique.")
element_names.add(element_name)
check_all_keys_case_insensitively_unique([name for _, name, _ in self.gen_elements()], location=())

def _find_element(self, element_name: str) -> tuple[str, str, SpatialElement | AnnData]:
"""
Expand Down
Loading
Loading